Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll | 8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll | 15
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.i1.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll | 103
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll | 89
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/br-constant-invalid-sgpr-copy.ll | 16
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/combine-binop-s64-with-s32-mask.mir | 321
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-of-shifted-logic.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll | 40
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll | 395
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 60
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 120
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir | 20
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir | 28
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smax-64.mir | 65
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin-64.mir | 65
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin.mir | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umax-64.mir | 65
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umin-64.mir | 65
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir | 20
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir | 116
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir | 116
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir | 84
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store.mir | 8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir | 118
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir | 118
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll | 199
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll | 14
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.implicit.ptr.buffer.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll | 473
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll | 1096
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll | 1219
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll | 697
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll | 1194
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll | 610
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll | 1168
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll | 1062
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.tfe.ll | 757
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll | 652
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll | 429
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 103
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-zextload-from-and.mir | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir | 77
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.s32.mir | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readfirstlane.mir | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.getpc.mir | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bitcast.mir | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-brcond.mir | 47
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-build-vector.mir | 101
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-default.mir | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fadd.mir | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptr-add.mir | 391
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir | 5
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sub.mir | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-uitofp.mir | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll | 8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 48
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/store-weird-size.ll | 224
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/v_bfe_i32.ll | 12
-rw-r--r--  llvm/test/CodeGen/AMDGPU/a-v-ds-atomic-cmpxchg.ll | 237
-rw-r--r--  llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll | 1123
-rw-r--r--  llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll | 1062
-rw-r--r--  llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll | 728
-rw-r--r--  llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll | 1539
-rw-r--r--  llvm/test/CodeGen/AMDGPU/acc-ldst.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/add_i64.ll | 137
-rw-r--r--  llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll | 134
-rw-r--r--  llvm/test/CodeGen/AMDGPU/allow-check.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll | 174
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn-cs-chain-intrinsic-dyn-vgpr-w32.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 11901
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll | 1148
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll | 1320
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll | 2886
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll | 240
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 5414
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll | 637
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll | 594
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll | 27
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgpu-branch-weight-metadata.ll | 36
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll | 101
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgpu-prepare-agpr-alloc.mir | 121
-rw-r--r--  llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll | 20
-rw-r--r--  llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll | 87
-rw-r--r--  llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll | 39
-rw-r--r--  llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll | 18
-rw-r--r--  llvm/test/CodeGen/AMDGPU/ashr64_reduce_flags.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll | 489
-rw-r--r--  llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll | 426
-rw-r--r--  llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll | 1486
-rw-r--r--  llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll | 24
-rw-r--r--  llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll | 16
-rw-r--r--  llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll | 20
-rw-r--r--  llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll | 213
-rw-r--r--  llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir | 136
-rw-r--r--  llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/bf16-conversions.ll | 12
-rw-r--r--  llvm/test/CodeGen/AMDGPU/bf16-math.ll | 31
-rw-r--r--  llvm/test/CodeGen/AMDGPU/bf16.ll | 1155
-rw-r--r--  llvm/test/CodeGen/AMDGPU/bitop3.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/br_cc.f16.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll | 32
-rw-r--r--  llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll | 42
-rw-r--r--  llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll | 42
-rw-r--r--  llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll | 42
-rw-r--r--  llvm/test/CodeGen/AMDGPU/buffer-intrinsic-mmo-type.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/bundle-breaks-phy-liveness.mir | 36
-rw-r--r--  llvm/test/CodeGen/AMDGPU/call-argument-types.ll | 93
-rw-r--r--  llvm/test/CodeGen/AMDGPU/calling-conventions.ll | 1281
-rw-r--r--  llvm/test/CodeGen/AMDGPU/change-scc-to-vcc.mir | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/clamp-modifier.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir | 192
-rw-r--r--  llvm/test/CodeGen/AMDGPU/collapse-endcf.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 42
-rw-r--r--  llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll | 34
-rw-r--r--  llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/dagcombine-select.ll | 220
-rw-r--r--  llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/disable-preload-kernargs.ll | 29
-rw-r--r--  llvm/test/CodeGen/AMDGPU/div_i128.ll | 144
-rw-r--r--  llvm/test/CodeGen/AMDGPU/dpp64_combine.ll | 42
-rw-r--r--  llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir | 18
-rw-r--r--  llvm/test/CodeGen/AMDGPU/dpp_combine_gfx1250.mir | 18
-rw-r--r--  llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/dvgpr_sym.ll | 70
-rw-r--r--  llvm/test/CodeGen/AMDGPU/dvgpr_sym_fail_too_many_block_size_16.ll | 15
-rw-r--r--  llvm/test/CodeGen/AMDGPU/dvgpr_sym_fail_too_many_block_size_16_anon.ll | 24
-rw-r--r--  llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll | 17
-rw-r--r--  llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 5
-rw-r--r--  llvm/test/CodeGen/AMDGPU/empty-text.ll | 9
-rw-r--r--  llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll | 116
-rw-r--r--  llvm/test/CodeGen/AMDGPU/extra-lds-size.ll | 7
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll | 133
-rw-r--r--  llvm/test/CodeGen/AMDGPU/finalizebundle.mir | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-gfx12-fake16.mir | 5
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-gfx12-true16.mir | 5
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir | 20
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir | 7
-rw-r--r--  llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll | 30
-rw-r--r--  llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 207
-rw-r--r--  llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 241
-rw-r--r--  llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 241
-rw-r--r--  llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 271
-rw-r--r--  llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll | 52
-rw-r--r--  llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 1658
-rw-r--r--  llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll | 194
-rw-r--r--  llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 31
-rw-r--r--  llvm/test/CodeGen/AMDGPU/flat_atomics.ll | 172
-rw-r--r--  llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll | 2952
-rw-r--r--  llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 162
-rw-r--r--  llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll | 5092
-rw-r--r--  llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll | 6014
-rw-r--r--  llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll | 4060
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fma.f16.gfx11plus.ll | 114
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fncall-implicitdef.ll | 25
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll | 41
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll | 311
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fold-imm-copy-agpr.mir | 334
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir | 133
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll | 72
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 307
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/frem.ll | 183
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll | 9
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll | 4463
-rw-r--r--  llvm/test/CodeGen/AMDGPU/function-args.ll | 251
-rw-r--r--  llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll | 12
-rw-r--r--  llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll | 468
-rw-r--r--  llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll | 16
-rw-r--r--  llvm/test/CodeGen/AMDGPU/gfx10plus-wavefront-sgpr-count.ll | 11
-rw-r--r--  llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll | 12
-rw-r--r--  llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll | 5
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 106
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 110
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 110
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 106
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global-extload-gfx11plus.ll | 207
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll | 363
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll | 162
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global_atomics.ll | 398
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll | 3390
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll | 162
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll | 4152
-rw-r--r--  llvm/test/CodeGen/AMDGPU/hard-clause-limit-attr.mir | 12
-rw-r--r--  llvm/test/CodeGen/AMDGPU/hard-clause-limit.mir | 30
-rw-r--r--  llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir | 608
-rw-r--r--  llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/hard-clauses-load-monitor.mir | 38
-rw-r--r--  llvm/test/CodeGen/AMDGPU/hard-clauses.mir | 46
-rw-r--r--  llvm/test/CodeGen/AMDGPU/hazard-getreg-waitalu.mir | 91
-rw-r--r--  llvm/test/CodeGen/AMDGPU/hazards-gfx1250.mir | 549
-rw-r--r--  llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll | 45
-rw-r--r--  llvm/test/CodeGen/AMDGPU/idot4u.ll | 41
-rw-r--r--  llvm/test/CodeGen/AMDGPU/imm.ll | 761
-rw-r--r--  llvm/test/CodeGen/AMDGPU/imm16.ll | 395
-rw-r--r--  llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll | 26
-rw-r--r--  llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll | 72
-rw-r--r--  llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll | 12
-rw-r--r--  llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir | 57
-rw-r--r--  llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir | 394
-rw-r--r--  llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir | 1174
-rw-r--r--  llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll | 72
-rw-r--r--  llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll | 46
-rw-r--r--  llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll | 74
-rw-r--r--  llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll | 71
-rw-r--r--  llvm/test/CodeGen/AMDGPU/inline-asm-av-constraint-err.ll | 27
-rw-r--r--  llvm/test/CodeGen/AMDGPU/inline-asm-av-constraint.ll | 217
-rw-r--r--  llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll | 26
-rw-r--r--  llvm/test/CodeGen/AMDGPU/inline-attr.ll | 19
-rw-r--r--  llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir | 123
-rw-r--r--  llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll | 321
-rw-r--r--  llvm/test/CodeGen/AMDGPU/integer-canonicalizing-src-modifiers.ll | 237
-rw-r--r--  llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll | 28
-rw-r--r--  llvm/test/CodeGen/AMDGPU/integer-select-src-modifiers.ll | 1011
-rw-r--r--  llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll | 26
-rw-r--r--  llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll | 256
-rw-r--r--  llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll | 80
-rw-r--r--  llvm/test/CodeGen/AMDGPU/issue120256-annotate-constexpr-addrspacecast.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll | 26
-rw-r--r--  llvm/test/CodeGen/AMDGPU/issue153808-extract-subvector-legalize.ll | 192
-rw-r--r--  llvm/test/CodeGen/AMDGPU/itofp.i128.ll | 11
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll | 13
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll | 72
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll | 61
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lds-size-pal-metadata.ll | 26
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll | 25
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll | 30
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll | 173
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll | 59
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll | 85
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll | 176
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll | 45
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll | 54
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll | 313
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll | 103
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll | 133
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll | 308
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll | 72
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll | 81
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll | 92
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll | 232
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll | 40
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll | 45
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll | 52
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll | 132
-rw-r--r--  llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll | 116
-rw-r--r--  llvm/test/CodeGen/AMDGPU/literal64.ll | 20
-rw-r--r--  llvm/test/CodeGen/AMDGPU/livevars-implicitdef.mir | 91
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 9
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll | 80
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll | 275
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll | 129
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll | 26
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll | 48
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll | 1524
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll | 16
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 578
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll | 9
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll | 705
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll | 99
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll | 705
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll | 218
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll | 128
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll | 143
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll | 35
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll | 23
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll | 66
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt_gfx1250.ll | 10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll | 114
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll | 868
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll | 111
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll | 868
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll | 104
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll | 110
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll | 420
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll | 420
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll | 28
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll | 54
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll | 17
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 66
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 72
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 98
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.log.ll | 115
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 115
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 123
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll | 17
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.sin.ll | 36
-rw-r--r--  llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll | 13
-rw-r--r--  llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 1765
-rw-r--r--  llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 715
-rw-r--r--  llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 60
-rw-r--r--  llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll | 68
-rw-r--r--  llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll | 68
-rw-r--r--  llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 60
-rw-r--r--  llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll | 27
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-constants.ll | 40
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll | 50
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll | 95
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lower-intrinsics-barriers.ll | 84
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lower-intrinsics-split-barriers.ll | 80
-rw-r--r--  llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll | 30
-rw-r--r--  llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir | 256
-rw-r--r--  llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir | 8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll | 10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll | 5
-rw-r--r--  llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll | 20
-rw-r--r--  llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 31
-rw-r--r--  llvm/test/CodeGen/AMDGPU/mad.u16.ll | 7
-rw-r--r--  llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 317
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll | 183
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll | 171
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll | 100
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll | 239
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll | 1651
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll | 52
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll | 89
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll | 1241
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll | 1709
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll | 96
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll | 1225
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll | 1223
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll | 1613
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll | 56
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll | 73
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll | 1241
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll | 1586
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll | 87
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll | 1241
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll | 1271
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll | 1211
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll | 69
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll | 1165
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll | 1211
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll | 80
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll | 1165
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll | 1211
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll | 23434
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll | 23329
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll | 22387
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll | 23329
-rw-r--r--  llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll | 23375
-rw-r--r--  llvm/test/CodeGen/AMDGPU/merge-flat-saddr-load-store.mir | 338
-rw-r--r--  llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir | 64
-rw-r--r--  llvm/test/CodeGen/AMDGPU/merge-tbuffer-gfx11.mir | 618
-rw-r--r--  llvm/test/CodeGen/AMDGPU/mfma-loop.ll | 119
-rw-r--r--  llvm/test/CodeGen/AMDGPU/minmax.ll | 344
-rw-r--r--  llvm/test/CodeGen/AMDGPU/mmra.ll | 72
-rw-r--r--  llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll | 7
-rw-r--r--  llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/move-to-valu-lshlrev.mir | 10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll | 10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll | 5
-rw-r--r--  llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll | 40
-rw-r--r--  llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll | 33
-rw-r--r--  llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll | 56
-rw-r--r--  llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir | 44
-rw-r--r--  llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll | 108
-rw-r--r--  llvm/test/CodeGen/AMDGPU/offset-split-flat.ll | 58
-rw-r--r--  llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 917
-rw-r--r--  llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll | 208
-rw-r--r--  llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll | 16
-rw-r--r--  llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir | 108
-rw-r--r--  llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir | 91
-rw-r--r--  llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/pr155452.ll | 84
-rw-r--r--  llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll | 193
-rw-r--r--  llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 336
-rw-r--r--  llvm/test/CodeGen/AMDGPU/preserve-hi16.ll | 54
-rw-r--r--  llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll | 244
-rw-r--r--  llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll | 8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll | 33
-rw-r--r--  llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll | 13
-rw-r--r--  llvm/test/CodeGen/AMDGPU/readcyclecounter.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir | 12
-rw-r--r--  llvm/test/CodeGen/AMDGPU/regcoalesce-64-bit-only-regs.mir | 57
-rw-r--r--  llvm/test/CodeGen/AMDGPU/rem_i128.ll | 180
-rw-r--r--  llvm/test/CodeGen/AMDGPU/remat-vop.mir | 12
-rw-r--r--  llvm/test/CodeGen/AMDGPU/remove-incompatible-wave64-feature.ll | 34
-rw-r--r--  llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll | 12
-rw-r--r--  llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir | 261
-rw-r--r--  llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll | 181
-rw-r--r--  llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir | 226
-rw-r--r--  llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir | 333
-rw-r--r--  llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll | 996
-rw-r--r--  llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll | 31
-rw-r--r--  llvm/test/CodeGen/AMDGPU/s-barrier.ll | 271
-rw-r--r--  llvm/test/CodeGen/AMDGPU/s_add_co_pseudo_lowering.mir | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/saddsat.ll | 52
-rw-r--r--  llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll | 43
-rw-r--r--  llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll | 8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/sgpr-count-graphics.ll | 38
-rw-r--r--  llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/shl64_reduce_flags.ll | 18
-rw-r--r--  llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir | 15
-rw-r--r--  llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir | 64
-rw-r--r--  llvm/test/CodeGen/AMDGPU/si-pre-allocate-wwwmregs-dbg-noreg.mir | 71
-rw-r--r--  llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll | 12
-rw-r--r--  llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll | 155
-rw-r--r--  llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll | 3
-rw-r--r--  llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/srl64_reduce_flags.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/ssubsat.ll | 378
-rw-r--r--  llvm/test/CodeGen/AMDGPU/store-to-constant-error.ll | 10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/store-to-constant.ll | 186
-rw-r--r--  llvm/test/CodeGen/AMDGPU/structurize-hoist.ll | 91
-rw-r--r--  llvm/test/CodeGen/AMDGPU/test_isel_single_lane.ll | 47
-rw-r--r--  llvm/test/CodeGen/AMDGPU/trap-abis.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/triton_regression_no_waterfall.mir | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll | 10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 222
-rw-r--r--  llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll | 58
-rw-r--r--  llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll | 25
-rw-r--r--  llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll | 8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll | 8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/unify-metadata.ll | 28
-rw-r--r--  llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vector-alloca-addrspacecast.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll | 126
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll | 78
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll | 8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll | 30
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vgpr-count-graphics-chain.ll | 26
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll | 24
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll | 296
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vgpr-remat.mir | 45
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll | 1
-rw-r--r--  llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir | 62
-rw-r--r--  llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir | 4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll | 11
-rw-r--r--  llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll | 1424
-rw-r--r--  llvm/test/CodeGen/AMDGPU/wqm.mir | 277
-rw-r--r--  llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 217
-rw-r--r--  llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 40
512 files changed, 219356 insertions, 38019 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll
index b68df4f..5903633 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=fiji < %s | FileCheck -check-prefix=VI %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=fiji < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s
; ===================================================================================
; V_ADD_LSHL_U32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
index 523d51d..fc23614 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll
@@ -1,12 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel %s -o - 2>&1 | FileCheck %s
; This file checks that the translation from llvm IR to generic
; MachineInstr is correct.
; Tests for add.
-; CHECK: name: addi32
-; CHECK: {{%[0-9]+}}:_(s32) = G_ADD
-define amdgpu_kernel void @addi32(i32 %arg1, i32 %arg2) {
+define void @addi32(i32 %arg1, i32 %arg2) {
+ ; CHECK-LABEL: name: addi32
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
%res = add i32 %arg1, %arg2
store i32 %res, ptr addrspace(1) poison
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.i1.ll
index 74422a1..25d7000 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.i1.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=WAVE64 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=WAVE32 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=WAVE64 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=WAVE32 %s
define i32 @s_andn2_i1_vcc(i32 %arg0, i32 %arg1) {
; WAVE64-LABEL: s_andn2_i1_vcc:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll
index cdcc3a4..fae3a75 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s
define hidden <2 x i64> @icmp_v2i32_sext_to_v2i64(<2 x i32> %arg) {
; CHECK-LABEL: icmp_v2i32_sext_to_v2i64:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index a86939f..dac726d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -91,7 +91,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -175,7 +175,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
- %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -233,7 +233,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -291,7 +291,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
- %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -368,7 +368,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -451,7 +451,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -534,7 +534,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -603,7 +603,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -676,7 +676,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -749,7 +749,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -846,7 +846,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
%gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id
%gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr addrspace(1) %out.gep, align 4
ret void
}
@@ -930,7 +930,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
%gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -1021,7 +1021,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr %out, align 4
ret void
}
@@ -1119,8 +1119,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
- %gep = getelementptr i32, ptr %ptr, i32 4
- %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %gep = getelementptr inbounds i32, ptr %ptr, i32 4
+ %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr %out, align 4
ret void
}
@@ -1218,8 +1218,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
- %gep = getelementptr i32, ptr %ptr, i32 4
- %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4
+ %gep = getelementptr inbounds i32, ptr %ptr, i32 4
+ %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr %out, align 4
ret void
}
@@ -1299,7 +1299,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -1384,8 +1384,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
- %gep = getelementptr i32, ptr %ptr, i32 4
- %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %gep = getelementptr inbounds i32, ptr %ptr, i32 4
+ %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -1470,8 +1470,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
- %gep = getelementptr i32, ptr %ptr, i32 4
- %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4
+ %gep = getelementptr inbounds i32, ptr %ptr, i32 4
+ %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -1599,8 +1599,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, ptr %ptr, i32 %id
%out.gep = getelementptr i32, ptr %out, i32 %id
- %gep = getelementptr i32, ptr %gep.tid, i32 5
- %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %gep = getelementptr inbounds i32, ptr %gep.tid, i32 5
+ %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr %out.gep, align 4
ret void
}
@@ -1706,8 +1706,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, ptr %ptr, i32 %id
- %gep = getelementptr i32, ptr %gep.tid, i32 5
- %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %gep = getelementptr inbounds i32, ptr %gep.tid, i32 5
+ %result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -1813,7 +1813,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
store i64 %result, ptr %out, align 4
ret void
}
@@ -1926,8 +1926,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_endpgm
- %gep = getelementptr i64, ptr %ptr, i32 4
- %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %gep = getelementptr inbounds i64, ptr %ptr, i32 4
+ %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
store i64 %result, ptr %out, align 4
ret void
}
@@ -2012,7 +2012,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
ret void
}
@@ -2102,8 +2102,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
- %gep = getelementptr i64, ptr %ptr, i32 4
- %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %gep = getelementptr inbounds i64, ptr %ptr, i32 4
+ %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
ret void
}
@@ -2193,8 +2193,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
- %gep = getelementptr i64, ptr %ptr, i32 4
- %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0
+ %gep = getelementptr inbounds i64, ptr %ptr, i32 4
+ %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
ret void
}
@@ -2333,8 +2333,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, ptr %ptr, i32 %id
%out.gep = getelementptr i64, ptr %out, i32 %id
- %gep = getelementptr i64, ptr %gep.tid, i32 5
- %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %gep = getelementptr inbounds i64, ptr %gep.tid, i32 5
+ %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
store i64 %result, ptr %out.gep, align 4
ret void
}
@@ -2444,8 +2444,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, ptr %ptr, i32 %id
- %gep = getelementptr i64, ptr %gep.tid, i32 5
- %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %gep = getelementptr inbounds i64, ptr %gep.tid, i32 5
+ %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
ret void
}
@@ -2540,7 +2540,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0
- %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i32 9 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i32 9 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %idx.0, ptr addrspace(1) %add_use, align 4
store i32 %result, ptr addrspace(1) %out, align 4
ret void
@@ -2629,7 +2629,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
store i64 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -2781,7 +2781,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
ret void
}
@@ -2844,7 +2844,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
- %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
ret void
}
@@ -2926,7 +2926,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
store i64 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -3014,7 +3014,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
store i64 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -3102,7 +3102,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8, !amdgpu.no.remote.memory !1
store i64 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -3176,7 +3176,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
- %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
ret void
}
@@ -3254,7 +3254,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
ret void
}
@@ -3332,7 +3332,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8, !amdgpu.no.remote.memory !1
ret void
}
@@ -3434,7 +3434,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
%gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id
%gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
store i64 %result, ptr addrspace(1) %out.gep, align 4
ret void
}
@@ -3523,7 +3523,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
%gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
- %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
ret void
}
@@ -3624,7 +3624,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0
- %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
store i32 %idx.0, ptr addrspace(1) %add_use, align 4
store i64 %result, ptr addrspace(1) %out, align 4
ret void
@@ -3635,6 +3635,7 @@ attributes #1 = { nounwind }
attributes #2 = { nounwind memory(none) }
!0 = !{i32 5, i32 6}
+!1 = !{}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index 7958e40..77d212a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -424,7 +424,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_endpgm
- %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -518,7 +518,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -614,7 +614,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -693,7 +693,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
- %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -776,7 +776,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -861,7 +861,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -971,7 +971,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
%gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id
%gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr addrspace(1) %out.gep, align 4
ret void
}
@@ -1067,7 +1067,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
%gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -1625,7 +1625,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
- %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
store i64 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -1725,7 +1725,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
store i64 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -1827,7 +1827,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8, !amdgpu.no.remote.memory !1
store i64 %result, ptr addrspace(1) %out, align 4
ret void
}
@@ -1912,7 +1912,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
- %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
ret void
}
@@ -2001,7 +2001,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
ret void
}
@@ -2092,7 +2092,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8, !amdgpu.no.remote.memory !1
ret void
}
@@ -2208,7 +2208,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
%gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id
%gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
store i64 %result, ptr addrspace(1) %out.gep, align 4
ret void
}
@@ -2310,7 +2310,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
%gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
- %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 syncscope("agent") seq_cst, align 8, !amdgpu.no.remote.memory !1
ret void
}
@@ -2414,7 +2414,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
- %result = atomicrmw uinc_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw uinc_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr %out, align 4
ret void
}
@@ -2525,8 +2525,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
- %gep = getelementptr i32, ptr %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %gep = getelementptr inbounds i32, ptr %ptr, i32 4
+ %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr %out, align 4
ret void
}
@@ -2639,8 +2639,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
- %gep = getelementptr i32, ptr %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4
+ %gep = getelementptr inbounds i32, ptr %ptr, i32 4
+ %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr %out, align 4
ret void
}
@@ -2731,7 +2731,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
- %result = atomicrmw uinc_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4
+ %result = atomicrmw uinc_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -2827,8 +2827,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
- %gep = getelementptr i32, ptr %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %gep = getelementptr inbounds i32, ptr %ptr, i32 4
+ %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -2926,8 +2926,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
- %gep = getelementptr i32, ptr %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4
+ %gep = getelementptr inbounds i32, ptr %ptr, i32 4
+ %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -3077,8 +3077,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, ptr %ptr, i32 %id
%out.gep = getelementptr i32, ptr %out, i32 %id
- %gep = getelementptr i32, ptr %gep.tid, i32 5
- %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %gep = getelementptr inbounds i32, ptr %gep.tid, i32 5
+ %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
store i32 %result, ptr %out.gep, align 4
ret void
}
@@ -3201,8 +3201,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
; GFX12-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, ptr %ptr, i32 %id
- %gep = getelementptr i32, ptr %gep.tid, i32 5
- %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
+ %gep = getelementptr inbounds i32, ptr %gep.tid, i32 5
+ %result = atomicrmw uinc_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !1
ret void
}
@@ -3444,7 +3444,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
- %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
store i64 %result, ptr %out, align 4
ret void
}
@@ -3571,8 +3571,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
- %gep = getelementptr i64, ptr %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %gep = getelementptr inbounds i64, ptr %ptr, i32 4
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
store i64 %result, ptr %out, align 4
ret void
}
@@ -3701,8 +3701,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
- %gep = getelementptr i64, ptr %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0
+ %gep = getelementptr inbounds i64, ptr %ptr, i32 4
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
store i64 %result, ptr %out, align 4
ret void
}
@@ -3799,7 +3799,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
- %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
ret void
}
@@ -3901,8 +3901,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
- %gep = getelementptr i64, ptr %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %gep = getelementptr inbounds i64, ptr %ptr, i32 4
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
ret void
}
@@ -4006,8 +4006,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
- %gep = getelementptr i64, ptr %ptr, i32 4
- %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0
+ %gep = getelementptr inbounds i64, ptr %ptr, i32 4
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
ret void
}
@@ -4169,8 +4169,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, ptr %ptr, i32 %id
%out.gep = getelementptr i64, ptr %out, i32 %id
- %gep = getelementptr i64, ptr %gep.tid, i32 5
- %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %gep = getelementptr inbounds i64, ptr %gep.tid, i32 5
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
store i64 %result, ptr %out.gep, align 4
ret void
}
@@ -4297,8 +4297,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
; GFX12-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, ptr %ptr, i32 %id
- %gep = getelementptr i64, ptr %gep.tid, i32 5
- %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
+ %gep = getelementptr inbounds i64, ptr %gep.tid, i32 5
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0, !amdgpu.no.remote.memory !1
ret void
}
@@ -4434,6 +4434,7 @@ attributes #1 = { nounwind }
attributes #2 = { nounwind memory(none) }
!0 = !{i32 5, i32 6}
+!1 = !{}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/br-constant-invalid-sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/br-constant-invalid-sgpr-copy.ll
index 439ffba..22324e6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/br-constant-invalid-sgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/br-constant-invalid-sgpr-copy.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=WAVE64 %s
-; RUN: llc -global-isel -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck -check-prefix=WAVE32 %s
+; RUN: llc -global-isel -new-reg-bank-select -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=WAVE64 %s
+; RUN: llc -global-isel -new-reg-bank-select -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck -check-prefix=WAVE32 %s
; This was mishandling the constant true and false values used as a
; scalar branch condition.
@@ -76,7 +76,8 @@ define void @br_undef() {
; WAVE64-NEXT: .LBB2_1: ; %bb0
; WAVE64-NEXT: ; =>This Inner Loop Header: Depth=1
; WAVE64-NEXT: ; implicit-def: $sgpr4
-; WAVE64-NEXT: s_and_b32 s4, s4, 1
+; WAVE64-NEXT: s_mov_b32 s5, 1
+; WAVE64-NEXT: s_and_b32 s4, s4, s5
; WAVE64-NEXT: s_cmp_lg_u32 s4, 0
; WAVE64-NEXT: s_cbranch_scc1 .LBB2_1
; WAVE64-NEXT: ; %bb.2: ; %.exit5
@@ -88,7 +89,8 @@ define void @br_undef() {
; WAVE32-NEXT: .LBB2_1: ; %bb0
; WAVE32-NEXT: ; =>This Inner Loop Header: Depth=1
; WAVE32-NEXT: ; implicit-def: $sgpr4
-; WAVE32-NEXT: s_and_b32 s4, s4, 1
+; WAVE32-NEXT: s_mov_b32 s5, 1
+; WAVE32-NEXT: s_and_b32 s4, s4, s5
; WAVE32-NEXT: s_cmp_lg_u32 s4, 0
; WAVE32-NEXT: s_cbranch_scc1 .LBB2_1
; WAVE32-NEXT: ; %bb.2: ; %.exit5
@@ -110,7 +112,8 @@ define void @br_poison() {
; WAVE64-NEXT: .LBB3_1: ; %bb0
; WAVE64-NEXT: ; =>This Inner Loop Header: Depth=1
; WAVE64-NEXT: ; implicit-def: $sgpr4
-; WAVE64-NEXT: s_and_b32 s4, s4, 1
+; WAVE64-NEXT: s_mov_b32 s5, 1
+; WAVE64-NEXT: s_and_b32 s4, s4, s5
; WAVE64-NEXT: s_cmp_lg_u32 s4, 0
; WAVE64-NEXT: s_cbranch_scc1 .LBB3_1
; WAVE64-NEXT: ; %bb.2: ; %.exit5
@@ -122,7 +125,8 @@ define void @br_poison() {
; WAVE32-NEXT: .LBB3_1: ; %bb0
; WAVE32-NEXT: ; =>This Inner Loop Header: Depth=1
; WAVE32-NEXT: ; implicit-def: $sgpr4
-; WAVE32-NEXT: s_and_b32 s4, s4, 1
+; WAVE32-NEXT: s_mov_b32 s5, 1
+; WAVE32-NEXT: s_and_b32 s4, s4, s5
; WAVE32-NEXT: s_cmp_lg_u32 s4, 0
; WAVE32-NEXT: s_cbranch_scc1 .LBB3_1
; WAVE32-NEXT: ; %bb.2: ; %.exit5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-binop-s64-with-s32-mask.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-binop-s64-with-s32-mask.mir
new file mode 100644
index 0000000..9259678
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-binop-s64-with-s32-mask.mir
@@ -0,0 +1,321 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck %s
+
+---
+name: test_and_mask_hi_rhs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_and_mask_hi_rhs
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s64) = G_CONSTANT i64 -4294967296
+ %2:_(s64) = G_AND %0, %1
+ $sgpr0_sgpr1 = COPY %2(s64)
+ SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+...
+---
+name: test_and_mask_hi_lhs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_and_mask_hi_lhs
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s64) = G_CONSTANT i64 -4294967296
+ %2:_(s64) = G_AND %1, %0
+ $sgpr0_sgpr1 = COPY %2(s64)
+ SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+...
+---
+name: test_and_mask_hi_48bit_mask_rhs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_and_mask_hi_48bit_mask_rhs
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -65536
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV]], [[C]]
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[AND]](s32), [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s64) = G_CONSTANT i64 -65536
+ %2:_(s64) = G_AND %0, %1
+ $sgpr0_sgpr1 = COPY %2(s64)
+ SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+...
+---
+name: test_and_mask_hi_16bit_mask_rhs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_and_mask_hi_16bit_mask_rhs
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -281474976710656
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[AND]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s64) = G_CONSTANT i64 -281474976710656
+ %2:_(s64) = G_AND %0, %1
+ $sgpr0_sgpr1 = COPY %2(s64)
+ SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+...
+---
+name: test_and_mask_lo_rhs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_and_mask_lo_rhs
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[ZEXT]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s64) = G_CONSTANT i64 4294967295
+ %2:_(s64) = G_AND %0, %1
+ $sgpr0_sgpr1 = COPY %2(s64)
+ SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+...
+---
+name: test_and_mask_lo_lhs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_and_mask_lo_lhs
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[ZEXT]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s64) = G_CONSTANT i64 4294967295
+ %2:_(s64) = G_AND %1, %0
+ $sgpr0_sgpr1 = COPY %2(s64)
+ SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+...
+---
+name: test_and_mask_lo_36bit_mask_rhs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_and_mask_lo_36bit_mask_rhs
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[UV1]], [[C]]
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV]](s32), [[AND]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s64) = G_CONSTANT i64 68719476735
+ %2:_(s64) = G_AND %0, %1
+ $sgpr0_sgpr1 = COPY %2(s64)
+ SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+...
+---
+name: test_and_mask_hi_with_merge_unmerge
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_and_mask_hi_with_merge_unmerge
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: $sgpr0 = COPY [[C]](s32)
+ ; CHECK-NEXT: $sgpr1 = COPY [[COPY]](s32)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = COPY $sgpr1
+ %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:_(s64) = G_CONSTANT i64 -4294967296
+ %4:_(s64) = G_AND %2, %3
+ %5:_(s32), %6:_(s32) = G_UNMERGE_VALUES %4(s64)
+ $sgpr0 = COPY %5(s32)
+ $sgpr1 = COPY %6(s32)
+ SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+...
+---
+name: negative_and_test_incorrect_types
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5
+
+ ; CHECK-LABEL: name: negative_and_test_incorrect_types
+ ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s128) = G_CONSTANT i128 -4294967296
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s128) = G_AND [[COPY]], [[C]]
+ ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[AND]](s128)
+ %0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ %1:_(s64) = COPY $vgpr4_vgpr5
+ %2:_(s128) = G_CONSTANT i128 -4294967296
+ %3:_(s128) = G_AND %0, %2
+ $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3
+...
+---
+name: test_or_mask_hi_rhs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_or_mask_hi_rhs
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s32), [[C]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s64) = G_CONSTANT i64 -4294967296
+ %2:_(s64) = G_OR %0, %1
+ $sgpr0_sgpr1 = COPY %2(s64)
+ SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+...
+---
+name: test_or_mask_hi_lhs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_or_mask_hi_lhs
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s32), [[C]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s64) = G_CONSTANT i64 -4294967296
+ %2:_(s64) = G_OR %1, %0
+ $sgpr0_sgpr1 = COPY %2(s64)
+ SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+...
+---
+name: test_or_mask_lo_rhs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_or_mask_lo_rhs
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s64) = G_CONSTANT i64 4294967295
+ %2:_(s64) = G_OR %0, %1
+ $sgpr0_sgpr1 = COPY %2(s64)
+ SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+...
+---
+name: test_or_mask_lo_lhs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_or_mask_lo_lhs
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV1]](s32)
+ ; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s64) = G_CONSTANT i64 4294967295
+ %2:_(s64) = G_OR %1, %0
+ $sgpr0_sgpr1 = COPY %2(s64)
+ SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
+...
+---
+name: test_or_mask_hi_with_merge_unmerge
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2
+ ; CHECK-LABEL: name: test_or_mask_hi_with_merge_unmerge
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: $sgpr0 = COPY [[COPY]](s32)
+ ; CHECK-NEXT: $sgpr1 = COPY [[C]](s32)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = COPY $sgpr1
+ %2:_(s64) = G_MERGE_VALUES %0(s32), %1(s32)
+ %3:_(s64) = G_CONSTANT i64 -4294967296
+ %4:_(s64) = G_OR %2, %3
+ %5:_(s32), %6:_(s32) = G_UNMERGE_VALUES %4(s64)
+ $sgpr0 = COPY %5(s32)
+ $sgpr1 = COPY %6(s32)
+ SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+...
+---
+name: negative_or_test_incorrect_types
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5
+
+ ; CHECK-LABEL: name: negative_or_test_incorrect_types
+ ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s128) = G_CONSTANT i128 -4294967296
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s128) = G_OR [[COPY]], [[C]]
+ ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[OR]](s128)
+ %0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ %1:_(s64) = COPY $vgpr4_vgpr5
+ %2:_(s128) = G_CONSTANT i128 -4294967296
+ %3:_(s128) = G_OR %0, %2
+ $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain.ll
index 2d3088f..917cdb3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn < %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn < %s | FileCheck %s
define amdgpu_cs i32 @test_shl_1(i32 inreg %arg1) {
; CHECK-LABEL: test_shl_1:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-of-shifted-logic.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-of-shifted-logic.ll
index 5532443..914a26b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-of-shifted-logic.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-of-shifted-logic.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn < %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn < %s | FileCheck %s
define amdgpu_cs i32 @test_shl_and_1(i32 inreg %arg1) {
; CHECK-LABEL: test_shl_and_1:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index ff26ea2..5dff8c1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -227,54 +227,52 @@ exit:
define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 x i32> inreg %.WorkgroupId, <3 x i32> %.LocalInvocationId) #0 {
; GFX10-LABEL: single_lane_execution_attribute:
; GFX10: ; %bb.0: ; %.entry
-; GFX10-NEXT: s_mov_b32 s6, 0
-; GFX10-NEXT: s_getpc_b64 s[4:5]
-; GFX10-NEXT: s_mov_b32 s7, -1
-; GFX10-NEXT: s_mov_b32 s2, s1
-; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
-; GFX10-NEXT: s_mov_b32 s1, 0
+; GFX10-NEXT: s_getpc_b64 s[12:13]
+; GFX10-NEXT: s_mov_b32 s12, 0
+; GFX10-NEXT: s_mov_b32 s2, s0
+; GFX10-NEXT: s_mov_b32 s3, s12
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v1, -1, 0
-; GFX10-NEXT: s_or_b64 s[12:13], s[4:5], s[0:1]
-; GFX10-NEXT: s_load_dwordx8 s[4:11], s[12:13], 0x0
+; GFX10-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3]
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
-; GFX10-NEXT: s_xor_b32 s3, vcc_lo, exec_lo
+; GFX10-NEXT: s_xor_b32 s2, vcc_lo, exec_lo
+; GFX10-NEXT: s_and_b32 vcc_lo, s2, exec_lo
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen
-; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s3
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2
; GFX10-NEXT: s_cbranch_vccnz .LBB4_4
; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader
-; GFX10-NEXT: s_mov_b32 s3, 0
+; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: .LBB4_2: ; %.preheader
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v3, s12
; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1
-; GFX10-NEXT: s_add_i32 s1, s1, 4
+; GFX10-NEXT: s_add_i32 s12, s12, 4
; GFX10-NEXT: buffer_load_dword v3, v3, s[4:7], 0 offen
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_readfirstlane_b32 s12, v3
-; GFX10-NEXT: s_add_i32 s3, s12, s3
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
+; GFX10-NEXT: s_add_i32 s2, s3, s2
; GFX10-NEXT: s_cbranch_vccnz .LBB4_2
; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2
-; GFX10-NEXT: s_or_b32 s1, s0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s2, v2
+; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX10-NEXT: s_branch .LBB4_6
; GFX10-NEXT: .LBB4_4:
-; GFX10-NEXT: s_mov_b32 s1, exec_lo
+; GFX10-NEXT: s_mov_b32 s2, exec_lo
; GFX10-NEXT: ; implicit-def: $vgpr1
-; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s1
+; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_vccz .LBB4_6
; GFX10-NEXT: ; %bb.5: ; %.19
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX10-NEXT: v_or_b32_e32 v1, 2, v1
; GFX10-NEXT: .LBB4_6: ; %.22
-; GFX10-NEXT: v_add_lshl_u32 v0, v0, s2, 2
+; GFX10-NEXT: v_add_lshl_u32 v0, v0, s1, 2
; GFX10-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen
; GFX10-NEXT: s_endpgm
.entry:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
index a8a75cd..dd01112 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; This file contains various tests that have divergent i1s used outside of
; the loop. These are lane masks in sgpr and need to have correct value in
@@ -13,30 +13,27 @@ define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val,
; GFX10-LABEL: divergent_i1_phi_used_outside_loop:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: ; implicit-def: $sgpr6
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: ; implicit-def: $sgpr7
; GFX10-NEXT: .LBB0_1: ; %loop
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v1
-; GFX10-NEXT: s_xor_b32 s8, s5, -1
-; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
-; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0
+; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6
+; GFX10-NEXT: s_mov_b32 s8, exec_lo
+; GFX10-NEXT: s_add_i32 s6, s6, 1
+; GFX10-NEXT: s_xor_b32 s8, s5, s8
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v0
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo
-; GFX10-NEXT: s_and_b32 s5, exec_lo, s5
-; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
-; GFX10-NEXT: s_or_b32 s7, s7, s5
+; GFX10-NEXT: s_and_b32 s9, exec_lo, s5
; GFX10-NEXT: s_mov_b32 s5, s8
-; GFX10-NEXT: s_and_b32 s9, exec_lo, s7
-; GFX10-NEXT: s_or_b32 s6, s6, s9
+; GFX10-NEXT: s_or_b32 s7, s7, s9
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB0_1
; GFX10-NEXT: ; %bb.2: ; %exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s7
; GFX10-NEXT: flat_store_dword v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -63,43 +60,44 @@ define void @divergent_i1_phi_used_outside_loop_larger_loop_body(float %val, ptr
; GFX10-LABEL: divergent_i1_phi_used_outside_loop_larger_loop_body:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s4, -1
-; GFX10-NEXT: ; implicit-def: $sgpr6
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: s_andn2_b32 s5, s4, exec_lo
-; GFX10-NEXT: s_and_b32 s4, exec_lo, -1
-; GFX10-NEXT: s_or_b32 s4, s5, s4
+; GFX10-NEXT: s_and_b32 s6, exec_lo, exec_lo
+; GFX10-NEXT: s_mov_b32 s4, -1
+; GFX10-NEXT: s_or_b32 s7, s5, s6
+; GFX10-NEXT: ; implicit-def: $sgpr5
; GFX10-NEXT: s_branch .LBB1_2
; GFX10-NEXT: .LBB1_1: ; %loop.cond
; GFX10-NEXT: ; in Loop: Header=BB1_2 Depth=1
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_add_co_u32 v1, s4, v1, 4
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s4, 0, v2, s4
-; GFX10-NEXT: s_andn2_b32 s7, s5, exec_lo
-; GFX10-NEXT: s_and_b32 s8, exec_lo, s6
-; GFX10-NEXT: v_cmp_le_i32_e32 vcc_lo, 10, v0
-; GFX10-NEXT: s_or_b32 s4, s7, s8
-; GFX10-NEXT: s_cbranch_vccz .LBB1_4
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; GFX10-NEXT: s_add_i32 s4, s4, 1
+; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, 4
+; GFX10-NEXT: s_cmp_ge_i32 s4, 10
+; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT: s_cselect_b32 s8, 1, 0
+; GFX10-NEXT: s_andn2_b32 s7, s6, exec_lo
+; GFX10-NEXT: s_and_b32 s9, exec_lo, s5
+; GFX10-NEXT: s_or_b32 s7, s7, s9
+; GFX10-NEXT: s_cmp_lg_u32 s8, 0
+; GFX10-NEXT: s_cbranch_scc0 .LBB1_4
; GFX10-NEXT: .LBB1_2: ; %loop.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_mov_b32 s5, s4
-; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo
-; GFX10-NEXT: s_and_b32 s6, exec_lo, s5
-; GFX10-NEXT: s_or_b32 s6, s4, s6
-; GFX10-NEXT: s_and_saveexec_b32 s4, s5
+; GFX10-NEXT: s_mov_b32 s6, s7
+; GFX10-NEXT: s_andn2_b32 s5, s5, exec_lo
+; GFX10-NEXT: s_and_b32 s7, exec_lo, s7
+; GFX10-NEXT: s_or_b32 s5, s5, s7
+; GFX10-NEXT: s_and_saveexec_b32 s7, s6
; GFX10-NEXT: s_cbranch_execz .LBB1_1
; GFX10-NEXT: ; %bb.3: ; %is.eq.zero
; GFX10-NEXT: ; in Loop: Header=BB1_2 Depth=1
-; GFX10-NEXT: global_load_dword v5, v[1:2], off
-; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
+; GFX10-NEXT: global_load_dword v0, v[1:2], off
+; GFX10-NEXT: s_andn2_b32 s5, s5, exec_lo
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5
-; GFX10-NEXT: s_and_b32 s7, exec_lo, vcc_lo
-; GFX10-NEXT: s_or_b32 s6, s6, s7
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: s_and_b32 s8, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s5, s5, s8
; GFX10-NEXT: s_branch .LBB1_1
; GFX10-NEXT: .LBB1_4: ; %exit
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
; GFX10-NEXT: flat_store_dword v[3:4], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -135,29 +133,26 @@ define void @divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val,
; GFX10-LABEL: divergent_i1_xor_used_outside_loop:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: ; implicit-def: $sgpr6
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: ; implicit-def: $sgpr7
; GFX10-NEXT: .LBB2_1: ; %loop
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v1
-; GFX10-NEXT: s_xor_b32 s5, s5, -1
-; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
-; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0
+; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6
+; GFX10-NEXT: s_mov_b32 s8, exec_lo
+; GFX10-NEXT: s_add_i32 s6, s6, 1
+; GFX10-NEXT: s_xor_b32 s5, s5, s8
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v0
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo
; GFX10-NEXT: s_and_b32 s8, exec_lo, s5
-; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
; GFX10-NEXT: s_or_b32 s7, s7, s8
-; GFX10-NEXT: s_and_b32 s8, exec_lo, s7
-; GFX10-NEXT: s_or_b32 s6, s6, s8
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB2_1
; GFX10-NEXT: ; %bb.2: ; %exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s7
; GFX10-NEXT: flat_store_dword v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -184,23 +179,20 @@ define void @divergent_i1_xor_used_outside_loop_twice(float %val, float %pre.con
; GFX10-LABEL: divergent_i1_xor_used_outside_loop_twice:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_mov_b32 s7, 0
; GFX10-NEXT: ; implicit-def: $sgpr6
-; GFX10-NEXT: ; implicit-def: $sgpr7
; GFX10-NEXT: .LBB3_1: ; %loop
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_cvt_f32_u32_e32 v6, v1
-; GFX10-NEXT: s_xor_b32 s5, s5, -1
-; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
-; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v0
+; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s7
+; GFX10-NEXT: s_mov_b32 s8, exec_lo
+; GFX10-NEXT: s_add_i32 s7, s7, 1
+; GFX10-NEXT: s_xor_b32 s5, s5, s8
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v0
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo
-; GFX10-NEXT: s_and_b32 s8, exec_lo, s5
; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
-; GFX10-NEXT: s_or_b32 s7, s7, s8
-; GFX10-NEXT: s_and_b32 s8, exec_lo, s7
+; GFX10-NEXT: s_and_b32 s8, exec_lo, s5
; GFX10-NEXT: s_or_b32 s6, s6, s8
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB3_1
@@ -247,66 +239,64 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB4_6
; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
-; GFX10-NEXT: ; implicit-def: $sgpr6
-; GFX10-NEXT: ; implicit-def: $sgpr8
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: ; implicit-def: $sgpr10
+; GFX10-NEXT: ; implicit-def: $sgpr11
; GFX10-NEXT: ; implicit-def: $sgpr9
-; GFX10-NEXT: ; implicit-def: $sgpr7
; GFX10-NEXT: s_branch .LBB4_3
; GFX10-NEXT: .LBB4_2: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB4_3 Depth=1
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s10
-; GFX10-NEXT: s_xor_b32 s10, s9, -1
-; GFX10-NEXT: s_and_b32 s11, exec_lo, s8
-; GFX10-NEXT: s_or_b32 s5, s11, s5
-; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo
-; GFX10-NEXT: s_and_b32 s10, exec_lo, s10
-; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
-; GFX10-NEXT: s_or_b32 s7, s7, s10
-; GFX10-NEXT: s_and_b32 s10, exec_lo, s7
-; GFX10-NEXT: s_or_b32 s6, s6, s10
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_xor_b32 s5, s11, exec_lo
+; GFX10-NEXT: s_and_b32 s12, exec_lo, s10
+; GFX10-NEXT: s_or_b32 s8, s12, s8
+; GFX10-NEXT: s_andn2_b32 s9, s9, exec_lo
+; GFX10-NEXT: s_and_b32 s5, exec_lo, s5
+; GFX10-NEXT: s_or_b32 s9, s9, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_cbranch_execz .LBB4_5
; GFX10-NEXT: .LBB4_3: ; %loop.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
-; GFX10-NEXT: s_andn2_b32 s9, s9, exec_lo
-; GFX10-NEXT: s_and_b32 s10, exec_lo, -1
-; GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo
-; GFX10-NEXT: s_or_b32 s9, s9, s10
-; GFX10-NEXT: v_lshlrev_b64 v[6:7], 2, v[5:6]
-; GFX10-NEXT: s_or_b32 s8, s8, s10
-; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v1, v6
-; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo
-; GFX10-NEXT: global_load_dword v6, v[6:7], off
+; GFX10-NEXT: s_ashr_i32 s5, s4, 31
+; GFX10-NEXT: s_andn2_b32 s10, s10, exec_lo
+; GFX10-NEXT: s_lshl_b64 s[12:13], s[4:5], 2
+; GFX10-NEXT: s_andn2_b32 s5, s11, exec_lo
+; GFX10-NEXT: v_mov_b32_e32 v5, s12
+; GFX10-NEXT: v_mov_b32_e32 v6, s13
+; GFX10-NEXT: s_and_b32 s11, exec_lo, exec_lo
+; GFX10-NEXT: s_and_b32 s12, exec_lo, exec_lo
+; GFX10-NEXT: s_or_b32 s11, s5, s11
+; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v1, v5
+; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v2, v6, vcc_lo
+; GFX10-NEXT: s_or_b32 s10, s10, s12
+; GFX10-NEXT: global_load_dword v5, v[5:6], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX10-NEXT: s_and_saveexec_b32 s10, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v5
+; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB4_2
; GFX10-NEXT: ; %bb.4: ; %loop.cond
; GFX10-NEXT: ; in Loop: Header=BB4_3 Depth=1
-; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v5
-; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, v5, v0
-; GFX10-NEXT: s_andn2_b32 s9, s9, exec_lo
-; GFX10-NEXT: s_and_b32 s11, exec_lo, 0
-; GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo
-; GFX10-NEXT: v_mov_b32_e32 v5, v6
-; GFX10-NEXT: s_and_b32 s12, exec_lo, vcc_lo
-; GFX10-NEXT: s_or_b32 s9, s9, s11
-; GFX10-NEXT: s_or_b32 s8, s8, s12
+; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, s4, v0
+; GFX10-NEXT: s_andn2_b32 s11, s11, exec_lo
+; GFX10-NEXT: s_and_b32 s12, exec_lo, 0
+; GFX10-NEXT: s_andn2_b32 s10, s10, exec_lo
+; GFX10-NEXT: s_add_i32 s4, s4, 1
+; GFX10-NEXT: s_and_b32 s13, exec_lo, vcc_lo
+; GFX10-NEXT: s_or_b32 s11, s11, s12
+; GFX10-NEXT: s_or_b32 s10, s10, s13
; GFX10-NEXT: s_branch .LBB4_2
; GFX10-NEXT: .LBB4_5: ; %loop.exit.guard
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_andn2_b32 s5, -1, exec_lo
-; GFX10-NEXT: s_and_b32 s6, exec_lo, s6
-; GFX10-NEXT: s_or_b32 s6, s5, s6
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo
+; GFX10-NEXT: s_and_b32 s5, exec_lo, s9
+; GFX10-NEXT: s_or_b32 s6, s4, s5
; GFX10-NEXT: .LBB4_6: ; %Flow1
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
; GFX10-NEXT: s_and_saveexec_b32 s4, s6
; GFX10-NEXT: s_cbranch_execz .LBB4_8
; GFX10-NEXT: ; %bb.7: ; %block.after.loop
@@ -355,56 +345,54 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
; GFX10-LABEL: divergent_i1_icmp_used_outside_loop:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: ; implicit-def: $sgpr5
-; GFX10-NEXT: ; implicit-def: $sgpr6
-; GFX10-NEXT: v_mov_b32_e32 v5, s4
+; GFX10-NEXT: ; implicit-def: $sgpr7
; GFX10-NEXT: s_branch .LBB5_2
; GFX10-NEXT: .LBB5_1: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT: s_and_b32 s7, exec_lo, s7
-; GFX10-NEXT: s_or_b32 s4, s7, s4
-; GFX10-NEXT: s_andn2_b32 s5, s5, exec_lo
-; GFX10-NEXT: s_and_b32 s7, exec_lo, s6
-; GFX10-NEXT: s_or_b32 s5, s5, s7
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_and_b32 s5, exec_lo, s5
+; GFX10-NEXT: s_or_b32 s6, s5, s6
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
; GFX10-NEXT: s_cbranch_execz .LBB5_6
; GFX10-NEXT: .LBB5_2: ; %cond.block.0
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_mov_b32_e32 v4, v5
-; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s4, v0
+; GFX10-NEXT: v_mov_b32_e32 v4, s4
+; GFX10-NEXT: s_andn2_b32 s5, s7, exec_lo
; GFX10-NEXT: s_and_b32 s7, exec_lo, vcc_lo
-; GFX10-NEXT: s_or_b32 s6, s6, s7
-; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo
+; GFX10-NEXT: s_or_b32 s7, s5, s7
+; GFX10-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB5_4
; GFX10-NEXT: ; %bb.3: ; %if.block.0
; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
-; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; GFX10-NEXT: v_lshlrev_b64 v[8:9], 2, v[4:5]
+; GFX10-NEXT: s_ashr_i32 s5, s4, 31
+; GFX10-NEXT: v_mov_b32_e32 v5, s4
+; GFX10-NEXT: s_lshl_b64 s[10:11], s[4:5], 2
+; GFX10-NEXT: v_mov_b32_e32 v8, s10
+; GFX10-NEXT: v_mov_b32_e32 v9, s11
; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v2, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v3, v9, vcc_lo
-; GFX10-NEXT: global_store_dword v[8:9], v4, off
+; GFX10-NEXT: global_store_dword v[8:9], v5, off
; GFX10-NEXT: .LBB5_4: ; %loop.break.block
; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v4
-; GFX10-NEXT: s_mov_b32 s7, -1
-; GFX10-NEXT: ; implicit-def: $vgpr5
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v1
+; GFX10-NEXT: s_mov_b32 s5, exec_lo
; GFX10-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB5_1
; GFX10-NEXT: ; %bb.5: ; %loop.cond
; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
-; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v4
-; GFX10-NEXT: s_andn2_b32 s7, -1, exec_lo
+; GFX10-NEXT: s_andn2_b32 s5, s5, exec_lo
; GFX10-NEXT: s_and_b32 s9, exec_lo, 0
-; GFX10-NEXT: s_or_b32 s7, s7, s9
+; GFX10-NEXT: s_add_i32 s4, s4, 1
+; GFX10-NEXT: s_or_b32 s5, s5, s9
; GFX10-NEXT: s_branch .LBB5_1
; GFX10-NEXT: .LBB5_6: ; %cond.block.1
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_and_saveexec_b32 s4, s5
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT: s_and_saveexec_b32 s4, s7
; GFX10-NEXT: s_cbranch_execz .LBB5_8
; GFX10-NEXT: ; %bb.7: ; %if.block.1
; GFX10-NEXT: global_store_dword v[6:7], v4, off
@@ -468,52 +456,50 @@ exit:
define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspace(1) %a, ptr %addr) {
; GFX10-LABEL: divergent_i1_freeze_used_outside_loop:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s1, exec_lo
+; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: s_mov_b32 s0, 0
-; GFX10-NEXT: s_mov_b32 s4, -1
-; GFX10-NEXT: v_mov_b32_e32 v5, s0
+; GFX10-NEXT: ; implicit-def: $sgpr4
; GFX10-NEXT: ; implicit-def: $sgpr3
-; GFX10-NEXT: ; implicit-def: $sgpr1
-; GFX10-NEXT: ; implicit-def: $sgpr2
; GFX10-NEXT: s_branch .LBB6_2
; GFX10-NEXT: .LBB6_1: ; %loop.cond
; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, v5, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v5
-; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo
-; GFX10-NEXT: s_and_b32 s5, exec_lo, s1
-; GFX10-NEXT: s_andn2_b32 s4, s4, exec_lo
-; GFX10-NEXT: s_or_b32 s2, s2, s5
+; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v0
+; GFX10-NEXT: s_add_i32 s0, s0, 1
+; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo
-; GFX10-NEXT: s_and_b32 s6, exec_lo, s2
-; GFX10-NEXT: s_or_b32 s4, s4, s5
-; GFX10-NEXT: s_or_b32 s3, s3, s6
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT: s_and_b32 s5, exec_lo, s4
+; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo
+; GFX10-NEXT: s_or_b32 s3, s3, s5
+; GFX10-NEXT: s_or_b32 s1, s1, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_execz .LBB6_4
; GFX10-NEXT: .LBB6_2: ; %loop.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo
-; GFX10-NEXT: s_and_b32 s5, exec_lo, s4
-; GFX10-NEXT: s_or_b32 s1, s1, s5
-; GFX10-NEXT: s_and_saveexec_b32 s5, s4
+; GFX10-NEXT: s_andn2_b32 s4, s4, exec_lo
+; GFX10-NEXT: s_and_b32 s5, exec_lo, s1
+; GFX10-NEXT: s_or_b32 s4, s4, s5
+; GFX10-NEXT: s_and_saveexec_b32 s5, s1
; GFX10-NEXT: s_cbranch_execz .LBB6_1
; GFX10-NEXT: ; %bb.3: ; %is.eq.zero
; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
-; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
-; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo
-; GFX10-NEXT: v_lshlrev_b64 v[6:7], 2, v[5:6]
-; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v1, v6
-; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo
-; GFX10-NEXT: global_load_dword v6, v[6:7], off
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: s_lshl_b64 s[6:7], s[0:1], 2
+; GFX10-NEXT: s_andn2_b32 s1, s4, exec_lo
+; GFX10-NEXT: v_mov_b32_e32 v5, s6
+; GFX10-NEXT: v_mov_b32_e32 v6, s7
+; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v1, v5
+; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v2, v6, vcc_lo
+; GFX10-NEXT: global_load_dword v5, v[5:6], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5
; GFX10-NEXT: s_and_b32 s4, exec_lo, vcc_lo
-; GFX10-NEXT: s_or_b32 s1, s1, s4
-; GFX10-NEXT: ; implicit-def: $sgpr4
+; GFX10-NEXT: s_or_b32 s4, s1, s4
+; GFX10-NEXT: ; implicit-def: $sgpr1
; GFX10-NEXT: s_branch .LBB6_1
; GFX10-NEXT: .LBB6_4: ; %exit
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s3
; GFX10-NEXT: flat_store_dword v[3:4], v0
; GFX10-NEXT: s_endpgm
@@ -548,64 +534,67 @@ exit:
define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a, ptr addrspace(1) %a.break) {
; GFX10-LABEL: loop_with_1break:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s0, 0
-; GFX10-NEXT: ; implicit-def: $sgpr1
-; GFX10-NEXT: ; implicit-def: $sgpr3
-; GFX10-NEXT: ; implicit-def: $sgpr4
-; GFX10-NEXT: ; implicit-def: $sgpr2
-; GFX10-NEXT: v_mov_b32_e32 v6, s0
+; GFX10-NEXT: ; implicit-def: $sgpr6
+; GFX10-NEXT: ; implicit-def: $sgpr7
+; GFX10-NEXT: ; implicit-def: $sgpr5
; GFX10-NEXT: s_branch .LBB7_2
; GFX10-NEXT: .LBB7_1: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB7_2 Depth=1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_and_b32 s5, exec_lo, s3
-; GFX10-NEXT: s_or_b32 s0, s5, s0
-; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo
-; GFX10-NEXT: s_and_b32 s5, exec_lo, s4
-; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo
-; GFX10-NEXT: s_or_b32 s2, s2, s5
-; GFX10-NEXT: s_and_b32 s5, exec_lo, s2
-; GFX10-NEXT: s_or_b32 s1, s1, s5
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT: s_and_b32 s1, exec_lo, s6
+; GFX10-NEXT: s_or_b32 s4, s1, s4
+; GFX10-NEXT: s_andn2_b32 s1, s5, exec_lo
+; GFX10-NEXT: s_and_b32 s2, exec_lo, s7
+; GFX10-NEXT: s_or_b32 s5, s1, s2
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execz .LBB7_4
; GFX10-NEXT: .LBB7_2: ; %A
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX10-NEXT: s_andn2_b32 s4, s4, exec_lo
-; GFX10-NEXT: s_and_b32 s5, exec_lo, -1
-; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo
-; GFX10-NEXT: s_or_b32 s4, s4, s5
-; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7]
-; GFX10-NEXT: s_or_b32 s3, s3, s5
-; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7
-; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
-; GFX10-NEXT: global_load_dword v9, v[9:10], off
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: s_mov_b32 s8, exec_lo
+; GFX10-NEXT: s_lshl_b64 s[2:3], s[0:1], 2
+; GFX10-NEXT: s_andn2_b32 s1, s7, exec_lo
+; GFX10-NEXT: v_mov_b32_e32 v7, s3
+; GFX10-NEXT: v_mov_b32_e32 v6, s2
+; GFX10-NEXT: s_and_b32 s7, exec_lo, s8
+; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
+; GFX10-NEXT: s_and_b32 s8, exec_lo, exec_lo
+; GFX10-NEXT: s_or_b32 s7, s1, s7
+; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v2, v6
+; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v7, vcc_lo
+; GFX10-NEXT: s_or_b32 s6, s6, s8
+; GFX10-NEXT: global_load_dword v6, v[6:7], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
-; GFX10-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB7_1
; GFX10-NEXT: ; %bb.3: ; %loop.body
; GFX10-NEXT: ; in Loop: Header=BB7_2 Depth=1
-; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7
-; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
-; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v6
-; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v6
-; GFX10-NEXT: s_andn2_b32 s4, s4, exec_lo
-; GFX10-NEXT: global_load_dword v9, v[7:8], off
-; GFX10-NEXT: s_and_b32 s6, exec_lo, 0
-; GFX10-NEXT: v_mov_b32_e32 v6, v10
-; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo
-; GFX10-NEXT: s_and_b32 s7, exec_lo, vcc_lo
-; GFX10-NEXT: s_or_b32 s4, s4, s6
-; GFX10-NEXT: s_or_b32 s3, s3, s7
+; GFX10-NEXT: v_mov_b32_e32 v7, s3
+; GFX10-NEXT: v_mov_b32_e32 v6, s2
+; GFX10-NEXT: s_add_i32 s2, s0, 1
+; GFX10-NEXT: s_cmpk_lt_u32 s0, 0x64
+; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0
+; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v0, v6
+; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo
+; GFX10-NEXT: s_andn2_b32 s3, s7, exec_lo
+; GFX10-NEXT: s_and_b32 s7, exec_lo, 0
+; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
+; GFX10-NEXT: global_load_dword v8, v[6:7], off
+; GFX10-NEXT: s_and_b32 s0, exec_lo, s0
+; GFX10-NEXT: s_or_b32 s7, s3, s7
+; GFX10-NEXT: s_or_b32 s6, s6, s0
+; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9
-; GFX10-NEXT: global_store_dword v[7:8], v9, off
+; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8
+; GFX10-NEXT: global_store_dword v[6:7], v8, off
; GFX10-NEXT: s_branch .LBB7_1
; GFX10-NEXT: .LBB7_4: ; %loop.exit.guard
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX10-NEXT: s_and_saveexec_b32 s0, s1
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_and_saveexec_b32 s0, s5
; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX10-NEXT: s_cbranch_execz .LBB7_6
; GFX10-NEXT: ; %bb.5: ; %break.body
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index e6e98fb..206011a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3202,7 +3202,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
; GFX10-NEXT: granulated_workitem_vgpr_count = 0
-; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
+; GFX10-NEXT: granulated_wavefront_sgpr_count = 0
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
@@ -4206,7 +4206,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
; GFX10-NEXT: granulated_workitem_vgpr_count = 0
-; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
+; GFX10-NEXT: granulated_wavefront_sgpr_count = 0
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
@@ -4560,7 +4560,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
; GFX10-NEXT: granulated_workitem_vgpr_count = 0
-; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
+; GFX10-NEXT: granulated_wavefront_sgpr_count = 0
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
index 4fdb408..b520ce1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mattr=+enable-flat-scratch -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=MESA %s
-; RUN: llc -global-isel -mattr=+enable-flat-scratch -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=PAL %s
+; RUN: llc -global-isel -new-reg-bank-select -mattr=+enable-flat-scratch -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=MESA %s
+; RUN: llc -global-isel -new-reg-bank-select -mattr=+enable-flat-scratch -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=PAL %s
; Test that the initialization for flat_scratch doesn't crash. PAL
; doesn't add a user SGPR for initializing flat_scratch, mesa does
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index a066b15..e6a8bac 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -1917,8 +1917,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_movk_i32 s0, 0x3e80
; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: s_movk_i32 s0, 0x3e84
+; GFX9-NEXT: s_add_i32 s0, s0, 4
; GFX9-NEXT: scratch_store_dword off, v0, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -1933,7 +1934,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; GFX10-NEXT: v_mov_b32_e32 v0, 13
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_movk_i32 s0, 0x3e84
+; GFX10-NEXT: s_movk_i32 s0, 0x3e80
+; GFX10-NEXT: s_add_i32 s0, s0, 4
; GFX10-NEXT: scratch_store_dword off, v0, off offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -1945,10 +1947,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX942-LABEL: store_load_large_imm_offset_kernel:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: v_mov_b32_e32 v0, 13
+; GFX942-NEXT: s_movk_i32 s0, 0x3e80
; GFX942-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, 15
-; GFX942-NEXT: s_movk_i32 s0, 0x3e84
+; GFX942-NEXT: s_add_i32 s0, s0, 4
; GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -1958,7 +1961,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX11-LABEL: store_load_large_imm_offset_kernel:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX11-NEXT: s_movk_i32 s0, 0x3e84
+; GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s0, s0, 4
; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
@@ -1986,8 +1991,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 0
; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s0, 4
; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -2002,7 +2008,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 13
; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15
-; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e80
+; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s0, 4
; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v0, off offset:4
; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -2014,10 +2021,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; UNALIGNED_GFX942-LABEL: store_load_large_imm_offset_kernel:
; UNALIGNED_GFX942: ; %bb.0: ; %bb
; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s0, 4
; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -2027,7 +2035,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; UNALIGNED_GFX11-LABEL: store_load_large_imm_offset_kernel:
; UNALIGNED_GFX11: ; %bb.0: ; %bb
; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s0, 4
; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
@@ -2061,11 +2071,13 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s0, 0x3e80
; GFX9-NEXT: v_mov_b32_e32 v0, 13
+; GFX9-NEXT: s_add_i32 s1, s32, s0
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX9-NEXT: s_add_i32 s0, s1, 4
; GFX9-NEXT: scratch_store_dword off, v0, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -2076,8 +2088,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 13
+; GFX10-NEXT: s_movk_i32 s0, 0x3e80
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX10-NEXT: s_add_i32 s1, s32, s0
+; GFX10-NEXT: s_add_i32 s0, s1, 4
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -2089,11 +2103,13 @@ define void @store_load_large_imm_offset_foo() {
; GFX942-LABEL: store_load_large_imm_offset_foo:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_movk_i32 s0, 0x3e80
; GFX942-NEXT: v_mov_b32_e32 v0, 13
+; GFX942-NEXT: s_add_i32 s1, s32, s0
; GFX942-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, 15
-; GFX942-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX942-NEXT: s_add_i32 s0, s1, 4
; GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -2104,7 +2120,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX11-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s1, s32, s0
+; GFX11-NEXT: s_add_i32 s0, s1, 4
; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
@@ -2133,11 +2152,13 @@ define void @store_load_large_imm_offset_foo() {
; UNALIGNED_GFX9-LABEL: store_load_large_imm_offset_foo:
; UNALIGNED_GFX9: ; %bb.0: ; %bb
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX9-NEXT: s_add_i32 s1, s32, s0
; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s1, 4
; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -2148,8 +2169,10 @@ define void @store_load_large_imm_offset_foo() {
; UNALIGNED_GFX10: ; %bb.0: ; %bb
; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15
-; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX10-NEXT: s_add_i32 s1, s32, s0
+; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s1, 4
; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -2161,11 +2184,13 @@ define void @store_load_large_imm_offset_foo() {
; UNALIGNED_GFX942-LABEL: store_load_large_imm_offset_foo:
; UNALIGNED_GFX942: ; %bb.0: ; %bb
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX942-NEXT: s_add_i32 s1, s32, s0
; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s1, 4
; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -2176,7 +2201,10 @@ define void @store_load_large_imm_offset_foo() {
; UNALIGNED_GFX11: ; %bb.0: ; %bb
; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; UNALIGNED_GFX11-NEXT: s_add_i32 s1, s32, s0
+; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s1, 4
; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll
index 6792612d..7766b3a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll
@@ -108,7 +108,7 @@ define <2 x half> @flat_atomic_fadd_ret_v2f16_agent_offset(ptr %ptr, <2 x half>
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x half>, ptr %ptr, i32 256
+ %gep = getelementptr inbounds <2 x half>, ptr %ptr, i32 256
%result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
ret <2 x half> %result
}
@@ -122,7 +122,7 @@ define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x half>, ptr %ptr, i32 256
+ %gep = getelementptr inbounds <2 x half>, ptr %ptr, i32 256
%unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 2785b78..481a254 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -2243,36 +2243,22 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
;
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-NEXT: s_mov_b32 s1, exec_lo
-; GFX1250-NEXT: s_mov_b32 s0, 0
-; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
-; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1250-NEXT: s_cbranch_execz .LBB51_3
+; GFX1250-NEXT: s_cbranch_execz .LBB51_2
; GFX1250-NEXT: ; %bb.1:
-; GFX1250-NEXT: s_bcnt1_i32_b32 s1, s1
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x24
+; GFX1250-NEXT: s_bcnt1_i32_b32 s0, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v4, s1
-; GFX1250-NEXT: ds_load_b64 v[2:3], v4
-; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
-; GFX1250-NEXT: .LBB51_2: ; %atomicrmw.start
-; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_add_f64_e32 v[6:7], v[2:3], v[0:1]
-; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[6:7], v4, v[6:7], v[2:3]
+; GFX1250-NEXT: v_dual_mul_f64 v[0:1], 4.0, v[0:1] :: v_dual_mov_b32 v2, s0
+; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
-; GFX1250-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
-; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT: s_cbranch_execnz .LBB51_2
-; GFX1250-NEXT: .LBB51_3:
+; GFX1250-NEXT: .LBB51_2:
; GFX1250-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2322,36 +2308,22 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
;
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-NEXT: s_mov_b32 s1, exec_lo
-; GFX1250-NEXT: s_mov_b32 s0, 0
-; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
-; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1250-NEXT: s_cbranch_execz .LBB52_3
+; GFX1250-NEXT: s_cbranch_execz .LBB52_2
; GFX1250-NEXT: ; %bb.1:
-; GFX1250-NEXT: s_bcnt1_i32_b32 s1, s1
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x24
+; GFX1250-NEXT: s_bcnt1_i32_b32 s0, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v4, s1
-; GFX1250-NEXT: ds_load_b64 v[2:3], v4
-; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
-; GFX1250-NEXT: .LBB52_2: ; %atomicrmw.start
-; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: v_dual_mul_f64 v[0:1], 4.0, v[0:1] :: v_dual_mov_b32 v2, s0
+; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_add_f64_e32 v[6:7], v[2:3], v[0:1]
-; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[6:7], v4, v[6:7], v[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
-; GFX1250-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
-; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT: s_cbranch_execnz .LBB52_2
-; GFX1250-NEXT: .LBB52_3:
+; GFX1250-NEXT: .LBB52_2:
; GFX1250-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2401,36 +2373,22 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
;
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-NEXT: s_mov_b32 s1, exec_lo
-; GFX1250-NEXT: s_mov_b32 s0, 0
-; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
-; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX1250-NEXT: s_cbranch_execz .LBB53_3
+; GFX1250-NEXT: s_cbranch_execz .LBB53_2
; GFX1250-NEXT: ; %bb.1:
-; GFX1250-NEXT: s_bcnt1_i32_b32 s1, s1
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x24
+; GFX1250-NEXT: s_bcnt1_i32_b32 s0, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v4, s1
-; GFX1250-NEXT: ds_load_b64 v[2:3], v4
-; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
-; GFX1250-NEXT: .LBB53_2: ; %atomicrmw.start
-; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_add_f64_e32 v[6:7], v[2:3], v[0:1]
-; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[6:7], v4, v[6:7], v[2:3]
+; GFX1250-NEXT: v_dual_mul_f64 v[0:1], 4.0, v[0:1] :: v_dual_mov_b32 v2, s0
+; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
-; GFX1250-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
-; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT: s_cbranch_execnz .LBB53_2
-; GFX1250-NEXT: .LBB53_3:
+; GFX1250-NEXT: .LBB53_2:
; GFX1250-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2459,23 +2417,9 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, v0
-; GFX1250-NEXT: ds_load_b64 v[0:1], v0
-; GFX1250-NEXT: s_mov_b32 s0, 0
-; GFX1250-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: v_add_f64_e32 v[0:1], 4.0, v[4:5]
-; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[4:5]
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT: s_cbranch_execnz .LBB54_1
-; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir
index 85d852fc..be9de72 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir
@@ -153,7 +153,7 @@ body: |
%2:vgpr(s32) = COPY $vgpr3
%3:vgpr(<2 x s32>) = G_BUILD_VECTOR %1, %2
%4:vgpr(s64) = G_CONSTANT i64 4
- %5:vgpr(p0) = G_PTR_ADD %0, %4
+ %5:vgpr(p0) = inbounds G_PTR_ADD %0, %4
%6:vgpr(s32) = G_AMDGPU_ATOMIC_CMPXCHG %5, %3 :: (load store seq_cst (s32), addrspace 0)
$vgpr0 = COPY %6
@@ -305,7 +305,7 @@ body: |
%2:vgpr(s64) = COPY $vgpr4_vgpr5
%3:vgpr(<2 x s64>) = G_BUILD_VECTOR %1, %2
%4:vgpr(s64) = G_CONSTANT i64 4
- %5:vgpr(p0) = G_PTR_ADD %0, %4
+ %5:vgpr(p0) = inbounds G_PTR_ADD %0, %4
%6:vgpr(s64) = G_AMDGPU_ATOMIC_CMPXCHG %5, %3 :: (load store seq_cst (s64), addrspace 0)
$vgpr0_vgpr1 = COPY %6
@@ -406,7 +406,7 @@ body: |
%2:vgpr(s32) = COPY $vgpr3
%3:vgpr(<2 x s32>) = G_BUILD_VECTOR %1, %2
%4:vgpr(s64) = G_CONSTANT i64 -4
- %5:vgpr(p0) = G_PTR_ADD %0, %4
+ %5:vgpr(p0) = inbounds G_PTR_ADD %0, %4
%6:vgpr(s32) = G_AMDGPU_ATOMIC_CMPXCHG %5, %3 :: (load store seq_cst (s32), addrspace 0)
$vgpr0 = COPY %6
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir
index dc317a8..3389ed72 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir
@@ -101,7 +101,7 @@ body: |
%0:vgpr(p0) = COPY $vgpr0_vgpr1
%1:vgpr(s32) = COPY $vgpr2
%2:vgpr(s64) = G_CONSTANT i64 2047
- %3:vgpr(p0) = G_PTR_ADD %0, %2
+ %3:vgpr(p0) = inbounds G_PTR_ADD %0, %2
%4:vgpr(s32) = G_ATOMICRMW_ADD %3, %1 :: (load store seq_cst (s32), addrspace 0)
$vgpr0 = COPY %4
@@ -155,7 +155,7 @@ body: |
%0:vgpr(p0) = COPY $vgpr0_vgpr1
%1:vgpr(s32) = COPY $vgpr2
%2:vgpr(s64) = G_CONSTANT i64 2047
- %3:vgpr(p0) = G_PTR_ADD %0, %2
+ %3:vgpr(p0) = inbounds G_PTR_ADD %0, %2
%4:vgpr(s32) = G_ATOMICRMW_ADD %3, %1 :: (load store seq_cst (s32), addrspace 0)
...
@@ -211,7 +211,7 @@ body: |
%0:vgpr(p0) = COPY $vgpr0_vgpr1
%1:vgpr(s32) = COPY $vgpr2
%2:vgpr(s64) = G_CONSTANT i64 2048
- %3:vgpr(p0) = G_PTR_ADD %0, %2
+ %3:vgpr(p0) = inbounds G_PTR_ADD %0, %2
%4:vgpr(s32) = G_ATOMICRMW_ADD %3, %1 :: (load store seq_cst (s32), addrspace 0)
$vgpr0 = COPY %4
@@ -265,7 +265,7 @@ body: |
%0:vgpr(p0) = COPY $vgpr0_vgpr1
%1:vgpr(s32) = COPY $vgpr2
%2:vgpr(s64) = G_CONSTANT i64 2048
- %3:vgpr(p0) = G_PTR_ADD %0, %2
+ %3:vgpr(p0) = inbounds G_PTR_ADD %0, %2
%4:vgpr(s32) = G_ATOMICRMW_ADD %3, %1 :: (load store seq_cst (s32), addrspace 0)
...
@@ -321,7 +321,7 @@ body: |
%0:vgpr(p0) = COPY $vgpr0_vgpr1
%1:vgpr(s32) = COPY $vgpr2
%2:vgpr(s64) = G_CONSTANT i64 4095
- %3:vgpr(p0) = G_PTR_ADD %0, %2
+ %3:vgpr(p0) = inbounds G_PTR_ADD %0, %2
%4:vgpr(s32) = G_ATOMICRMW_ADD %3, %1 :: (load store seq_cst (s32), addrspace 0)
$vgpr0 = COPY %4
@@ -375,7 +375,7 @@ body: |
%0:vgpr(p0) = COPY $vgpr0_vgpr1
%1:vgpr(s32) = COPY $vgpr2
%2:vgpr(s64) = G_CONSTANT i64 4095
- %3:vgpr(p0) = G_PTR_ADD %0, %2
+ %3:vgpr(p0) = inbounds G_PTR_ADD %0, %2
%4:vgpr(s32) = G_ATOMICRMW_ADD %3, %1 :: (load store seq_cst (s32), addrspace 0)
...
@@ -463,7 +463,7 @@ body: |
%0:vgpr(p0) = COPY $vgpr0_vgpr1
%1:vgpr(s32) = COPY $vgpr2
%2:vgpr(s64) = G_CONSTANT i64 4097
- %3:vgpr(p0) = G_PTR_ADD %0, %2
+ %3:vgpr(p0) = inbounds G_PTR_ADD %0, %2
%4:vgpr(s32) = G_ATOMICRMW_ADD %3, %1 :: (load store seq_cst (s32), addrspace 0)
$vgpr0 = COPY %4
@@ -547,7 +547,7 @@ body: |
%0:vgpr(p0) = COPY $vgpr0_vgpr1
%1:vgpr(s32) = COPY $vgpr2
%2:vgpr(s64) = G_CONSTANT i64 4097
- %3:vgpr(p0) = G_PTR_ADD %0, %2
+ %3:vgpr(p0) = inbounds G_PTR_ADD %0, %2
%4:vgpr(s32) = G_ATOMICRMW_ADD %3, %1 :: (load store seq_cst (s32), addrspace 0)
...
@@ -647,7 +647,7 @@ body: |
%0:vgpr(p0) = COPY $vgpr0_vgpr1
%1:vgpr(s64) = COPY $vgpr2_vgpr3
%2:vgpr(s64) = G_CONSTANT i64 4095
- %3:vgpr(p0) = G_PTR_ADD %0, %2
+ %3:vgpr(p0) = inbounds G_PTR_ADD %0, %2
%4:vgpr(s64) = G_ATOMICRMW_ADD %3, %1 :: (load store seq_cst (s64), addrspace 0)
$vgpr0_vgpr1 = COPY %4
@@ -701,7 +701,7 @@ body: |
%0:vgpr(p0) = COPY $vgpr0_vgpr1
%1:vgpr(s64) = COPY $vgpr2_vgpr3
%2:vgpr(s64) = G_CONSTANT i64 4095
- %3:vgpr(p0) = G_PTR_ADD %0, %2
+ %3:vgpr(p0) = inbounds G_PTR_ADD %0, %2
%4:vgpr(s64) = G_ATOMICRMW_ADD %3, %1 :: (load store seq_cst (s64), addrspace 0)
...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir
index eba64b8..5bfb2b2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir
@@ -492,7 +492,7 @@ body: |
; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
%0:vgpr(p0) = COPY $vgpr0_vgpr1
%1:vgpr(s64) = G_CONSTANT i64 -2048
- %2:vgpr(p0) = G_PTR_ADD %0, %1
+ %2:vgpr(p0) = inbounds G_PTR_ADD %0, %1
%3:vgpr(s32) = G_LOAD %2 :: (load seq_cst (s32), align 4, addrspace 0)
$vgpr0 = COPY %3
@@ -561,7 +561,7 @@ body: |
; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
%0:vgpr(p0) = COPY $vgpr0_vgpr1
%1:vgpr(s64) = G_CONSTANT i64 4095
- %2:vgpr(p0) = G_PTR_ADD %0, %1
+ %2:vgpr(p0) = inbounds G_PTR_ADD %0, %1
%3:vgpr(s32) = G_LOAD %2 :: (load seq_cst (s32), align 4, addrspace 0)
$vgpr0 = COPY %3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
index e1325a0..532b4bf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
@@ -1191,7 +1191,7 @@ body: |
; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
%0:vgpr(p1) = COPY $vgpr0_vgpr1
%1:vgpr(s64) = G_CONSTANT i64 2047
- %2:vgpr(p1) = G_PTR_ADD %0, %1
+ %2:vgpr(p1) = inbounds G_PTR_ADD %0, %1
%3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0)
$vgpr0 = COPY %3
@@ -1275,7 +1275,7 @@ body: |
; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
%0:vgpr(p1) = COPY $vgpr0_vgpr1
%1:vgpr(s64) = G_CONSTANT i64 2048
- %2:vgpr(p1) = G_PTR_ADD %0, %1
+ %2:vgpr(p1) = inbounds G_PTR_ADD %0, %1
%3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0)
$vgpr0 = COPY %3
@@ -1375,7 +1375,7 @@ body: |
; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
%0:vgpr(p1) = COPY $vgpr0_vgpr1
%1:vgpr(s64) = G_CONSTANT i64 -2047
- %2:vgpr(p1) = G_PTR_ADD %0, %1
+ %2:vgpr(p1) = inbounds G_PTR_ADD %0, %1
%3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0)
$vgpr0 = COPY %3
@@ -1475,7 +1475,7 @@ body: |
; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
%0:vgpr(p1) = COPY $vgpr0_vgpr1
%1:vgpr(s64) = G_CONSTANT i64 -2048
- %2:vgpr(p1) = G_PTR_ADD %0, %1
+ %2:vgpr(p1) = inbounds G_PTR_ADD %0, %1
%3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0)
$vgpr0 = COPY %3
@@ -1559,7 +1559,7 @@ body: |
; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
%0:vgpr(p1) = COPY $vgpr0_vgpr1
%1:vgpr(s64) = G_CONSTANT i64 4095
- %2:vgpr(p1) = G_PTR_ADD %0, %1
+ %2:vgpr(p1) = inbounds G_PTR_ADD %0, %1
%3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0)
$vgpr0 = COPY %3
@@ -1659,7 +1659,7 @@ body: |
; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
%0:vgpr(p1) = COPY $vgpr0_vgpr1
%1:vgpr(s64) = G_CONSTANT i64 4096
- %2:vgpr(p1) = G_PTR_ADD %0, %1
+ %2:vgpr(p1) = inbounds G_PTR_ADD %0, %1
%3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0)
$vgpr0 = COPY %3
@@ -1759,7 +1759,7 @@ body: |
; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
%0:vgpr(p1) = COPY $vgpr0_vgpr1
%1:vgpr(s64) = G_CONSTANT i64 -4095
- %2:vgpr(p1) = G_PTR_ADD %0, %1
+ %2:vgpr(p1) = inbounds G_PTR_ADD %0, %1
%3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0)
$vgpr0 = COPY %3
@@ -1859,7 +1859,7 @@ body: |
; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
%0:vgpr(p1) = COPY $vgpr0_vgpr1
%1:vgpr(s64) = G_CONSTANT i64 -4096
- %2:vgpr(p1) = G_PTR_ADD %0, %1
+ %2:vgpr(p1) = inbounds G_PTR_ADD %0, %1
%3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0)
$vgpr0 = COPY %3
@@ -1959,7 +1959,7 @@ body: |
; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
%0:vgpr(p1) = COPY $vgpr0_vgpr1
%1:vgpr(s64) = G_CONSTANT i64 8191
- %2:vgpr(p1) = G_PTR_ADD %0, %1
+ %2:vgpr(p1) = inbounds G_PTR_ADD %0, %1
%3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0)
$vgpr0 = COPY %3
@@ -2059,7 +2059,7 @@ body: |
; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
%0:vgpr(p1) = COPY $vgpr0_vgpr1
%1:vgpr(s64) = G_CONSTANT i64 8192
- %2:vgpr(p1) = G_PTR_ADD %0, %1
+ %2:vgpr(p1) = inbounds G_PTR_ADD %0, %1
%3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0)
$vgpr0 = COPY %3
@@ -2159,7 +2159,7 @@ body: |
; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
%0:vgpr(p1) = COPY $vgpr0_vgpr1
%1:vgpr(s64) = G_CONSTANT i64 -8191
- %2:vgpr(p1) = G_PTR_ADD %0, %1
+ %2:vgpr(p1) = inbounds G_PTR_ADD %0, %1
%3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0)
$vgpr0 = COPY %3
@@ -2259,7 +2259,7 @@ body: |
; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
%0:vgpr(p1) = COPY $vgpr0_vgpr1
%1:vgpr(s64) = G_CONSTANT i64 -8192
- %2:vgpr(p1) = G_PTR_ADD %0, %1
+ %2:vgpr(p1) = inbounds G_PTR_ADD %0, %1
%3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0)
$vgpr0 = COPY %3
@@ -2359,7 +2359,7 @@ body: |
; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
%0:vgpr(p1) = COPY $vgpr0_vgpr1
%1:vgpr(s64) = G_CONSTANT i64 8388607
- %2:vgpr(p1) = G_PTR_ADD %0, %1
+ %2:vgpr(p1) = inbounds G_PTR_ADD %0, %1
%3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0)
$vgpr0 = COPY %3
@@ -2567,7 +2567,7 @@ body: |
; GFX12-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
%0:vgpr(p1) = COPY $vgpr0_vgpr1
%1:vgpr(s64) = G_CONSTANT i64 -8388608
- %2:vgpr(p1) = G_PTR_ADD %0, %1
+ %2:vgpr(p1) = inbounds G_PTR_ADD %0, %1
%3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 0)
$vgpr0 = COPY %3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smax-64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smax-64.mir
new file mode 100644
index 0000000..ace4599
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smax-64.mir
@@ -0,0 +1,65 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1250 -run-pass=instruction-select %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: smax_s64_sv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $vgpr2_vgpr3
+ ; GCN-LABEL: name: smax_s64_sv
+ ; GCN: liveins: $sgpr0_sgpr1, $vgpr2_vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+ ; GCN-NEXT: [[V_MAX_I64_e64_:%[0-9]+]]:vreg_64_align2 = V_MAX_I64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MAX_I64_e64_]]
+ %0:sgpr(s64) = COPY $sgpr0_sgpr1
+ %1:vgpr(s64) = COPY $vgpr2_vgpr3
+ %2:vgpr(s64) = G_SMAX %0, %1
+ S_ENDPGM 0, implicit %2
+...
+
+---
+name: smax_s64_vs
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr2_sgpr3
+ ; GCN-LABEL: name: smax_s64_vs
+ ; GCN: liveins: $vgpr0_vgpr1, $sgpr2_sgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $sgpr2_sgpr3
+ ; GCN-NEXT: [[V_MAX_I64_e64_:%[0-9]+]]:vreg_64_align2 = V_MAX_I64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MAX_I64_e64_]]
+ %0:sgpr(s64) = COPY $vgpr0_vgpr1
+ %1:vgpr(s64) = COPY $sgpr2_sgpr3
+ %2:vgpr(s64) = G_SMAX %0, %1
+ S_ENDPGM 0, implicit %2
+...
+
+---
+name: smax_s64_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GCN-LABEL: name: smax_s64_vv
+ ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+ ; GCN-NEXT: [[V_MAX_I64_e64_:%[0-9]+]]:vreg_64_align2 = V_MAX_I64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MAX_I64_e64_]]
+ %0:vgpr(s64) = COPY $vgpr0_vgpr1
+ %1:vgpr(s64) = COPY $vgpr2_vgpr3
+ %2:vgpr(s64) = G_SMAX %0, %1
+ S_ENDPGM 0, implicit %2
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin-64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin-64.mir
new file mode 100644
index 0000000..f341bdf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin-64.mir
@@ -0,0 +1,65 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1250 -run-pass=instruction-select %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: smin_s64_sv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $vgpr2_vgpr3
+ ; GCN-LABEL: name: smin_s64_sv
+ ; GCN: liveins: $sgpr0_sgpr1, $vgpr2_vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+ ; GCN-NEXT: [[V_MIN_I64_e64_:%[0-9]+]]:vreg_64_align2 = V_MIN_I64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MIN_I64_e64_]]
+ %0:sgpr(s64) = COPY $sgpr0_sgpr1
+ %1:vgpr(s64) = COPY $vgpr2_vgpr3
+ %2:vgpr(s64) = G_SMIN %0, %1
+ S_ENDPGM 0, implicit %2
+...
+
+---
+name: smin_s64_vs
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr2_sgpr3
+ ; GCN-LABEL: name: smin_s64_vs
+ ; GCN: liveins: $vgpr0_vgpr1, $sgpr2_sgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $sgpr2_sgpr3
+ ; GCN-NEXT: [[V_MIN_I64_e64_:%[0-9]+]]:vreg_64_align2 = V_MIN_I64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MIN_I64_e64_]]
+ %0:sgpr(s64) = COPY $vgpr0_vgpr1
+ %1:vgpr(s64) = COPY $sgpr2_sgpr3
+ %2:vgpr(s64) = G_SMIN %0, %1
+ S_ENDPGM 0, implicit %2
+...
+
+---
+name: smin_s64_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GCN-LABEL: name: smin_s64_vv
+ ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+ ; GCN-NEXT: [[V_MIN_I64_e64_:%[0-9]+]]:vreg_64_align2 = V_MIN_I64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MIN_I64_e64_]]
+ %0:vgpr(s64) = COPY $vgpr0_vgpr1
+ %1:vgpr(s64) = COPY $vgpr2_vgpr3
+ %2:vgpr(s64) = G_SMIN %0, %1
+ S_ENDPGM 0, implicit %2
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin.mir
index 33f14c1..2df27bd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-smin.mir
@@ -2,6 +2,7 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=instruction-select %s -o - | FileCheck -check-prefix=GCN %s
---
name: smin_s32_ss
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
index 6e92d85..5b65c0e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
@@ -1237,7 +1237,7 @@ body: |
%0:vgpr(p1) = COPY $vgpr0_vgpr1
%1:vgpr(s32) = COPY $vgpr2
%2:vgpr(s64) = G_CONSTANT i64 2047
- %3:vgpr(p1) = G_PTR_ADD %0, %2
+ %3:vgpr(p1) = inbounds G_PTR_ADD %0, %2
G_STORE %1, %3 :: (store (s32), align 4, addrspace 0)
...
@@ -1337,7 +1337,7 @@ body: |
%0:vgpr(p1) = COPY $vgpr0_vgpr1
%1:vgpr(s32) = COPY $vgpr2
%2:vgpr(s64) = G_CONSTANT i64 8388607
- %3:vgpr(p1) = G_PTR_ADD %0, %2
+ %3:vgpr(p1) = inbounds G_PTR_ADD %0, %2
G_STORE %1, %3 :: (store (s32), align 4, addrspace 0)
...
@@ -1545,7 +1545,7 @@ body: |
%0:vgpr(p1) = COPY $vgpr0_vgpr1
%1:vgpr(s32) = COPY $vgpr2
%2:vgpr(s64) = G_CONSTANT i64 -8388608
- %3:vgpr(p1) = G_PTR_ADD %0, %2
+ %3:vgpr(p1) = inbounds G_PTR_ADD %0, %2
G_STORE %1, %3 :: (store (s32), align 4, addrspace 0)
...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umax-64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umax-64.mir
new file mode 100644
index 0000000..9edcf57
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umax-64.mir
@@ -0,0 +1,65 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1250 -run-pass=instruction-select %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: umax_s64_sv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $vgpr2_vgpr3
+ ; GCN-LABEL: name: umax_s64_sv
+ ; GCN: liveins: $sgpr0_sgpr1, $vgpr2_vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+ ; GCN-NEXT: [[V_MAX_U64_e64_:%[0-9]+]]:vreg_64_align2 = V_MAX_U64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MAX_U64_e64_]]
+ %0:sgpr(s64) = COPY $sgpr0_sgpr1
+ %1:vgpr(s64) = COPY $vgpr2_vgpr3
+ %2:vgpr(s64) = G_UMAX %0, %1
+ S_ENDPGM 0, implicit %2
+...
+
+---
+name: umax_s64_vs
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr2_sgpr3
+ ; GCN-LABEL: name: umax_s64_vs
+ ; GCN: liveins: $vgpr0_vgpr1, $sgpr2_sgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $sgpr2_sgpr3
+ ; GCN-NEXT: [[V_MAX_U64_e64_:%[0-9]+]]:vreg_64_align2 = V_MAX_U64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MAX_U64_e64_]]
+ %0:sgpr(s64) = COPY $vgpr0_vgpr1
+ %1:vgpr(s64) = COPY $sgpr2_sgpr3
+ %2:vgpr(s64) = G_UMAX %0, %1
+ S_ENDPGM 0, implicit %2
+...
+
+---
+name: umax_s64_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GCN-LABEL: name: umax_s64_vv
+ ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+ ; GCN-NEXT: [[V_MAX_U64_e64_:%[0-9]+]]:vreg_64_align2 = V_MAX_U64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MAX_U64_e64_]]
+ %0:vgpr(s64) = COPY $vgpr0_vgpr1
+ %1:vgpr(s64) = COPY $vgpr2_vgpr3
+ %2:vgpr(s64) = G_UMAX %0, %1
+ S_ENDPGM 0, implicit %2
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umin-64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umin-64.mir
new file mode 100644
index 0000000..e6c6811
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-umin-64.mir
@@ -0,0 +1,65 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1250 -run-pass=instruction-select %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: umin_s64_sv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $vgpr2_vgpr3
+ ; GCN-LABEL: name: umin_s64_sv
+ ; GCN: liveins: $sgpr0_sgpr1, $vgpr2_vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+ ; GCN-NEXT: [[V_MIN_U64_e64_:%[0-9]+]]:vreg_64_align2 = V_MIN_U64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MIN_U64_e64_]]
+ %0:sgpr(s64) = COPY $sgpr0_sgpr1
+ %1:vgpr(s64) = COPY $vgpr2_vgpr3
+ %2:vgpr(s64) = G_UMIN %0, %1
+ S_ENDPGM 0, implicit %2
+...
+
+---
+name: umin_s64_vs
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr2_sgpr3
+ ; GCN-LABEL: name: umin_s64_vs
+ ; GCN: liveins: $vgpr0_vgpr1, $sgpr2_sgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $sgpr2_sgpr3
+ ; GCN-NEXT: [[V_MIN_U64_e64_:%[0-9]+]]:vreg_64_align2 = V_MIN_U64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MIN_U64_e64_]]
+ %0:sgpr(s64) = COPY $vgpr0_vgpr1
+ %1:vgpr(s64) = COPY $sgpr2_sgpr3
+ %2:vgpr(s64) = G_UMIN %0, %1
+ S_ENDPGM 0, implicit %2
+...
+
+---
+name: umin_s64_vv
+legalized: true
+regBankSelected: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GCN-LABEL: name: umin_s64_vv
+ ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY $vgpr2_vgpr3
+ ; GCN-NEXT: [[V_MIN_U64_e64_:%[0-9]+]]:vreg_64_align2 = V_MIN_U64_e64 [[COPY]], [[COPY1]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MIN_U64_e64_]]
+ %0:vgpr(s64) = COPY $vgpr0_vgpr1
+ %1:vgpr(s64) = COPY $vgpr2_vgpr3
+ %2:vgpr(s64) = G_UMIN %0, %1
+ S_ENDPGM 0, implicit %2
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
index 6a4522f..d69a3e1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir
@@ -141,11 +141,11 @@ body: |
; SIVI-NEXT: {{ $}}
; SIVI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; SIVI-NEXT: [[COPY1:%[0-9]+]]:_(p5) = COPY $vgpr0
+ ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p5)
; SIVI-NEXT: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
; SIVI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 68
; SIVI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[COPY2]], [[C]](s64)
; SIVI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
- ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p5)
; SIVI-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32)
; SIVI-NEXT: [[C1:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1
; SIVI-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
@@ -157,9 +157,9 @@ body: |
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
+ ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p5)
; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_private_base
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64)
- ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p5)
; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32)
; GFX9-NEXT: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1
; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
@@ -210,11 +210,11 @@ body: |
; SIVI-NEXT: {{ $}}
; SIVI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; SIVI-NEXT: [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr0
+ ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p3)
; SIVI-NEXT: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
; SIVI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
; SIVI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[COPY2]], [[C]](s64)
; SIVI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 64, addrspace 4)
- ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p3)
; SIVI-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32)
; SIVI-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
; SIVI-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
@@ -226,9 +226,9 @@ body: |
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+ ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p3)
; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_shared_base
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64)
- ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p3)
; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32)
; GFX9-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
@@ -354,20 +354,20 @@ body: |
; SIVI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; SIVI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1
; SIVI-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY1]](<2 x p3>)
+ ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3)
; SIVI-NEXT: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
; SIVI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
; SIVI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[COPY2]], [[C]](s64)
; SIVI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 64, addrspace 4)
- ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3)
; SIVI-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32)
; SIVI-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
; SIVI-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
; SIVI-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C1]]
; SIVI-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C2]]
+ ; SIVI-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3)
; SIVI-NEXT: [[COPY3:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
; SIVI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[COPY3]], [[C]](s64)
; SIVI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32), align 64, addrspace 4)
- ; SIVI-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3)
; SIVI-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT1]](s32), [[LOAD1]](s32)
; SIVI-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C1]]
; SIVI-NEXT: [[SELECT1:%[0-9]+]]:_(p0) = G_SELECT [[ICMP1]](s1), [[MV1]], [[C2]]
@@ -379,17 +379,17 @@ body: |
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1
; GFX9-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY]](<2 x p3>)
+ ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3)
; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_shared_base
; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64)
- ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3)
; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV3]](s32)
; GFX9-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1
; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C]]
; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]]
+ ; GFX9-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3)
; GFX9-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_shared_base
; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_1]](s64)
- ; GFX9-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3)
; GFX9-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT1]](s32), [[UV5]](s32)
; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C]]
; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(p0) = G_SELECT [[ICMP1]](s1), [[MV1]], [[C1]]
@@ -506,19 +506,19 @@ body: |
; SIVI-NEXT: {{ $}}
; SIVI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; SIVI-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
+ ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5)
; SIVI-NEXT: [[COPY1:%[0-9]+]]:_(p4) = COPY [[COPY]](p4)
; SIVI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 68
; SIVI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64)
; SIVI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4)
- ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5)
; SIVI-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32)
; SIVI-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p0)
;
; GFX9-LABEL: name: test_addrspacecast_p5_fi_to_p0
; GFX9: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
+ ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5)
; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_private_base
; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64)
- ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5)
; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p0)
%0:_(p5) = G_FRAME_INDEX %stack.0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir
index aebda3f..cbd9c21 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.s.buffer.load.mir
@@ -2,6 +2,7 @@
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GCN,GFX67 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GCN,GFX67 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GCN,GFX12 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GCN,GFX12 %s
---
name: s_buffer_load_s32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir
index db11855..45714fd9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smax.mir
@@ -4,6 +4,7 @@
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX1250 %s
---
name: test_smax_s32
@@ -34,6 +35,14 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX9-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0 = COPY [[SMAX]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_smax_s32
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[SMAX]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = G_SMAX %0, %1
@@ -72,6 +81,14 @@ body: |
; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY]](s64), [[COPY1]]
; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
+ ;
+ ; GFX1250-LABEL: name: test_smax_s64
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s64) = G_SMAX [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[SMAX]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
%2:_(s64) = G_SMAX %0, %1
@@ -115,6 +132,17 @@ body: |
; GFX9-NEXT: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[TRUNC1]]
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_smax_s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[TRUNC1]]
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX]](s16)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s16) = G_TRUNC %0
@@ -165,6 +193,19 @@ body: |
; GFX9-NEXT: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[TRUNC1]]
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_smax_s8
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 8
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
+ ; GFX1250-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 8
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG1]](s32)
+ ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[TRUNC1]]
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX]](s16)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s8) = G_TRUNC %0
@@ -209,6 +250,16 @@ body: |
; GFX9-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 17
; GFX9-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[SEXT_INREG1]]
; GFX9-NEXT: $vgpr0 = COPY [[SMAX]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_smax_s17
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 17
+ ; GFX1250-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 17
+ ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SEXT_INREG]], [[SEXT_INREG1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[SMAX]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s17) = G_TRUNC %0
@@ -259,6 +310,18 @@ body: |
; GFX9-NEXT: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[UV1]], [[UV3]]
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SMAX]](s32), [[SMAX1]](s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_smax_v2s32
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
+ ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[UV1]], [[UV3]]
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SMAX]](s32), [[SMAX1]](s32)
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = COPY $vgpr2_vgpr3
%2:_(<2 x s32>) = G_SMAX %0, %1
@@ -309,6 +372,19 @@ body: |
; GFX9-NEXT: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[UV2]], [[UV5]]
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SMAX]](s32), [[SMAX1]](s32), [[SMAX2]](s32)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_smax_v3s32
+ ; GFX1250: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
+ ; GFX1250-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>)
+ ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[UV]], [[UV3]]
+ ; GFX1250-NEXT: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[UV1]], [[UV4]]
+ ; GFX1250-NEXT: [[SMAX2:%[0-9]+]]:_(s32) = G_SMAX [[UV2]], [[UV5]]
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SMAX]](s32), [[SMAX1]](s32), [[SMAX2]](s32)
+ ; GFX1250-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
%0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
%1:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5
%2:_(<3 x s32>) = G_SMAX %0, %1
@@ -375,6 +451,14 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
; GFX9-NEXT: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0 = COPY [[SMAX]](<2 x s16>)
+ ;
+ ; GFX1250-LABEL: name: test_smax_v2s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[SMAX]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
%1:_(<2 x s16>) = COPY $vgpr1
%2:_(<2 x s16>) = G_SMAX %0, %1
@@ -461,6 +545,26 @@ body: |
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX1]](s16)
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32)
; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_smax_v3s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
+ ; GFX1250-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX1250-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>)
+ ; GFX1250-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[SMAX1:%[0-9]+]]:_(s16) = G_SMAX [[TRUNC]], [[TRUNC1]]
+ ; GFX1250-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[SMAX]](<2 x s16>)
+ ; GFX1250-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMAX1]](s16)
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32)
+ ; GFX1250-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
%0:_(<3 x s16>) = G_IMPLICIT_DEF
%1:_(<3 x s16>) = G_IMPLICIT_DEF
%2:_(<3 x s16>) = G_SMAX %0, %1
@@ -568,6 +672,18 @@ body: |
; GFX9-NEXT: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV1]], [[UV3]]
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SMAX]](<2 x s16>), [[SMAX1]](<2 x s16>)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+ ;
+ ; GFX1250-LABEL: name: test_smax_v4s16
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
+ ; GFX1250-NEXT: [[SMAX:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[SMAX1:%[0-9]+]]:_(<2 x s16>) = G_SMAX [[UV1]], [[UV3]]
+ ; GFX1250-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SMAX]](<2 x s16>), [[SMAX1]](<2 x s16>)
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
%0:_(<4 x s16>) = COPY $vgpr0_vgpr1
%1:_(<4 x s16>) = COPY $vgpr2_vgpr3
%2:_(<4 x s16>) = G_SMAX %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir
index d366242..88fe5d0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smin.mir
@@ -4,6 +4,7 @@
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX1250 %s
---
name: test_smin_s32
@@ -34,6 +35,14 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX9-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0 = COPY [[SMIN]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_smin_s32
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[SMIN]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = G_SMIN %0, %1
@@ -72,6 +81,14 @@ body: |
; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY]](s64), [[COPY1]]
; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
+ ;
+ ; GFX1250-LABEL: name: test_smin_s64
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s64) = G_SMIN [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[SMIN]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
%2:_(s64) = G_SMIN %0, %1
@@ -115,6 +132,17 @@ body: |
; GFX9-NEXT: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[TRUNC1]]
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_smin_s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[TRUNC1]]
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN]](s16)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s16) = G_TRUNC %0
@@ -165,6 +193,19 @@ body: |
; GFX9-NEXT: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[TRUNC1]]
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_smin_s8
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 8
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG]](s32)
+ ; GFX1250-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 8
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SEXT_INREG1]](s32)
+ ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[TRUNC1]]
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN]](s16)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s8) = G_TRUNC %0
@@ -209,6 +250,16 @@ body: |
; GFX9-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 17
; GFX9-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[SEXT_INREG1]]
; GFX9-NEXT: $vgpr0 = COPY [[SMIN]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_smin_s17
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 17
+ ; GFX1250-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 17
+ ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SEXT_INREG]], [[SEXT_INREG1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[SMIN]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s17) = G_TRUNC %0
@@ -259,6 +310,18 @@ body: |
; GFX9-NEXT: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[UV1]], [[UV3]]
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SMIN]](s32), [[SMIN1]](s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_smin_v2s32
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
+ ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[UV1]], [[UV3]]
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SMIN]](s32), [[SMIN1]](s32)
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = COPY $vgpr2_vgpr3
%2:_(<2 x s32>) = G_SMIN %0, %1
@@ -309,6 +372,19 @@ body: |
; GFX9-NEXT: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[UV2]], [[UV5]]
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SMIN]](s32), [[SMIN1]](s32), [[SMIN2]](s32)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_smin_v3s32
+ ; GFX1250: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
+ ; GFX1250-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>)
+ ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[UV]], [[UV3]]
+ ; GFX1250-NEXT: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[UV1]], [[UV4]]
+ ; GFX1250-NEXT: [[SMIN2:%[0-9]+]]:_(s32) = G_SMIN [[UV2]], [[UV5]]
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SMIN]](s32), [[SMIN1]](s32), [[SMIN2]](s32)
+ ; GFX1250-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
%0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
%1:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5
%2:_(<3 x s32>) = G_SMIN %0, %1
@@ -375,6 +451,14 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
; GFX9-NEXT: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0 = COPY [[SMIN]](<2 x s16>)
+ ;
+ ; GFX1250-LABEL: name: test_smin_v2s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[SMIN]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
%1:_(<2 x s16>) = COPY $vgpr1
%2:_(<2 x s16>) = G_SMIN %0, %1
@@ -461,6 +545,26 @@ body: |
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN1]](s16)
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32)
; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_smin_v3s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
+ ; GFX1250-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX1250-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>)
+ ; GFX1250-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[SMIN1:%[0-9]+]]:_(s16) = G_SMIN [[TRUNC]], [[TRUNC1]]
+ ; GFX1250-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[SMIN]](<2 x s16>)
+ ; GFX1250-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SMIN1]](s16)
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32)
+ ; GFX1250-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
%0:_(<3 x s16>) = G_IMPLICIT_DEF
%1:_(<3 x s16>) = G_IMPLICIT_DEF
%2:_(<3 x s16>) = G_SMIN %0, %1
@@ -568,6 +672,18 @@ body: |
; GFX9-NEXT: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV1]], [[UV3]]
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SMIN]](<2 x s16>), [[SMIN1]](<2 x s16>)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+ ;
+ ; GFX1250-LABEL: name: test_smin_v4s16
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
+ ; GFX1250-NEXT: [[SMIN:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[SMIN1:%[0-9]+]]:_(<2 x s16>) = G_SMIN [[UV1]], [[UV3]]
+ ; GFX1250-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[SMIN]](<2 x s16>), [[SMIN1]](<2 x s16>)
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
%0:_(<4 x s16>) = COPY $vgpr0_vgpr1
%1:_(<4 x s16>) = COPY $vgpr2_vgpr3
%2:_(<4 x s16>) = G_SMIN %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir
index 2b84c6b..acbcb098 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir
@@ -886,33 +886,34 @@ body: |
; SI-NEXT: {{ $}}
; SI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
; SI-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
- ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; SI-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
- ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C]](s32)
+ ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY2]], [[C]](s32)
; SI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
; SI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
- ; SI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32)
+ ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64)
+ ; SI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32)
; SI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C2]](s32)
+ ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C2]](s32)
; SI-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
; SI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C3]](s64)
; SI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
; SI-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; SI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]]
+ ; SI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]]
; SI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C4]](s32)
; SI-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; SI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C6]](s64)
- ; SI-NEXT: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store (s8), addrspace 1)
+ ; SI-NEXT: G_STORE [[COPY3]](s32), [[COPY]](p1) :: (store (s8), addrspace 1)
; SI-NEXT: G_STORE [[LSHR2]](s32), [[PTR_ADD2]](p1) :: (store (s8) into unknown-address + 1, addrspace 1)
- ; SI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
- ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[LSHR1]], [[COPY3]](s32)
+ ; SI-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+ ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[LSHR1]], [[COPY4]](s32)
; SI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[PTR_ADD1]], [[C6]](s64)
; SI-NEXT: G_STORE [[LSHR1]](s32), [[PTR_ADD1]](p1) :: (store (s8) into unknown-address + 2, addrspace 1)
; SI-NEXT: G_STORE [[LSHR3]](s32), [[PTR_ADD3]](p1) :: (store (s8) into unknown-address + 3, addrspace 1)
; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64)
- ; SI-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
+ ; SI-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32)
; SI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C5]]
- ; SI-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[COPY4]](s32)
+ ; SI-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[COPY5]](s32)
; SI-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[PTR_ADD]], [[C6]](s64)
; SI-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD]](p1) :: (store (s8) into unknown-address + 4, addrspace 1)
; SI-NEXT: G_STORE [[LSHR4]](s32), [[PTR_ADD4]](p1) :: (store (s8) into unknown-address + 5, addrspace 1)
@@ -922,11 +923,12 @@ body: |
; CI-NEXT: {{ $}}
; CI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
; CI-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
- ; CI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; CI-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
; CI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
- ; CI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C]](s32)
+ ; CI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY2]], [[C]](s32)
; CI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
; CI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
+ ; CI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64)
; CI-NEXT: G_STORE [[TRUNC]](s32), [[COPY]](p1) :: (store (s32), align 1, addrspace 1)
; CI-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64)
; CI-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD]](p1) :: (store (s16) into unknown-address + 4, align 1, addrspace 1)
@@ -936,22 +938,23 @@ body: |
; VI-NEXT: {{ $}}
; VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
; VI-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
- ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; VI-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
- ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C]](s32)
+ ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY2]], [[C]](s32)
; VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
- ; VI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32)
+ ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64)
+ ; VI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32)
; VI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C2]](s32)
+ ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C2]](s32)
; VI-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
; VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C3]](s64)
- ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64)
+ ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s64)
; VI-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
; VI-NEXT: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C4]](s16)
; VI-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C5]](s64)
- ; VI-NEXT: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store (s8), addrspace 1)
+ ; VI-NEXT: G_STORE [[COPY3]](s32), [[COPY]](p1) :: (store (s8), addrspace 1)
; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16)
; VI-NEXT: G_STORE [[ANYEXT]](s32), [[PTR_ADD2]](p1) :: (store (s8) into unknown-address + 1, addrspace 1)
; VI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
@@ -960,11 +963,11 @@ body: |
; VI-NEXT: G_STORE [[LSHR1]](s32), [[PTR_ADD1]](p1) :: (store (s8) into unknown-address + 2, addrspace 1)
; VI-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR3]](s16)
; VI-NEXT: G_STORE [[ANYEXT1]](s32), [[PTR_ADD3]](p1) :: (store (s8) into unknown-address + 3, addrspace 1)
- ; VI-NEXT: [[TRUNC3:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64)
- ; VI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s64)
- ; VI-NEXT: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC4]], [[C4]](s16)
+ ; VI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s64)
+ ; VI-NEXT: [[TRUNC4:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64)
+ ; VI-NEXT: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C4]](s16)
; VI-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[PTR_ADD]], [[C5]](s64)
- ; VI-NEXT: G_STORE [[TRUNC3]](s32), [[PTR_ADD]](p1) :: (store (s8) into unknown-address + 4, addrspace 1)
+ ; VI-NEXT: G_STORE [[TRUNC4]](s32), [[PTR_ADD]](p1) :: (store (s8) into unknown-address + 4, addrspace 1)
; VI-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR4]](s16)
; VI-NEXT: G_STORE [[ANYEXT2]](s32), [[PTR_ADD4]](p1) :: (store (s8) into unknown-address + 5, addrspace 1)
;
@@ -973,11 +976,12 @@ body: |
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C]](s32)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY2]], [[C]](s32)
; GFX9-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64)
; GFX9-NEXT: G_STORE [[TRUNC]](s32), [[COPY]](p1) :: (store (s32), align 1, addrspace 1)
; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64)
; GFX9-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD]](p1) :: (store (s16) into unknown-address + 4, align 1, addrspace 1)
@@ -998,17 +1002,18 @@ body: |
; SI-NEXT: {{ $}}
; SI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
; SI-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
- ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; SI-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
- ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C]](s32)
+ ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY2]], [[C]](s32)
; SI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
; SI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
- ; SI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32)
+ ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64)
+ ; SI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32)
; SI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C2]](s32)
+ ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C2]](s32)
; SI-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
; SI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C3]](s64)
- ; SI-NEXT: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store (s16), addrspace 1)
+ ; SI-NEXT: G_STORE [[COPY3]](s32), [[COPY]](p1) :: (store (s16), addrspace 1)
; SI-NEXT: G_STORE [[LSHR1]](s32), [[PTR_ADD1]](p1) :: (store (s16) into unknown-address + 2, addrspace 1)
; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64)
; SI-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD]](p1) :: (store (s16) into unknown-address + 4, addrspace 1)
@@ -1018,11 +1023,12 @@ body: |
; CI-NEXT: {{ $}}
; CI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
; CI-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
- ; CI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; CI-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
; CI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
- ; CI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C]](s32)
+ ; CI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY2]], [[C]](s32)
; CI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
; CI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
+ ; CI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64)
; CI-NEXT: G_STORE [[TRUNC]](s32), [[COPY]](p1) :: (store (s32), align 2, addrspace 1)
; CI-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64)
; CI-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD]](p1) :: (store (s16) into unknown-address + 4, addrspace 1)
@@ -1032,17 +1038,18 @@ body: |
; VI-NEXT: {{ $}}
; VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
; VI-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
- ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; VI-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
- ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C]](s32)
+ ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY2]], [[C]](s32)
; VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
- ; VI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32)
+ ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64)
+ ; VI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32)
; VI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C2]](s32)
+ ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C2]](s32)
; VI-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
; VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C3]](s64)
- ; VI-NEXT: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store (s16), addrspace 1)
+ ; VI-NEXT: G_STORE [[COPY3]](s32), [[COPY]](p1) :: (store (s16), addrspace 1)
; VI-NEXT: G_STORE [[LSHR1]](s32), [[PTR_ADD1]](p1) :: (store (s16) into unknown-address + 2, addrspace 1)
; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64)
; VI-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD]](p1) :: (store (s16) into unknown-address + 4, addrspace 1)
@@ -1052,11 +1059,12 @@ body: |
; GFX9-NEXT: {{ $}}
; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
- ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C]](s32)
+ ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY2]], [[C]](s32)
; GFX9-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
+ ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64)
; GFX9-NEXT: G_STORE [[TRUNC]](s32), [[COPY]](p1) :: (store (s32), align 2, addrspace 1)
; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64)
; GFX9-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD]](p1) :: (store (s16) into unknown-address + 4, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store.mir
index a931c63..7fd2319 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store.mir
@@ -285,13 +285,13 @@ body: |
; VI-NEXT: {{ $}}
; VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
; VI-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
- ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
- ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64)
+ ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64)
+ ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
; VI-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
- ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C]](s16)
+ ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C]](s16)
; VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
- ; VI-NEXT: G_STORE [[TRUNC]](s32), [[COPY]](p1) :: (store (s8), addrspace 1)
+ ; VI-NEXT: G_STORE [[TRUNC1]](s32), [[COPY]](p1) :: (store (s8), addrspace 1)
; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16)
; VI-NEXT: G_STORE [[ANYEXT]](s32), [[PTR_ADD]](p1) :: (store (s8) into unknown-address + 1, addrspace 1)
%0:_(p1) = COPY $vgpr0_vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir
index e8fa4e9..32b526e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umax.mir
@@ -4,6 +4,7 @@
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX1250 %s
---
name: test_umax_s32
@@ -34,6 +35,14 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX9-NEXT: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0 = COPY [[UMAX]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_umax_s32
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[UMAX]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = G_UMAX %0, %1
@@ -72,6 +81,14 @@ body: |
; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ugt), [[COPY]](s64), [[COPY1]]
; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
+ ;
+ ; GFX1250-LABEL: name: test_umax_s64
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s64) = G_UMAX [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[UMAX]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
%2:_(s64) = G_UMAX %0, %1
@@ -116,6 +133,17 @@ body: |
; GFX9-NEXT: [[UMAX:%[0-9]+]]:_(s16) = G_UMAX [[TRUNC]], [[TRUNC1]]
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMAX]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_umax_s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s16) = G_UMAX [[TRUNC]], [[TRUNC1]]
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMAX]](s16)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s16) = G_TRUNC %0
@@ -169,6 +197,20 @@ body: |
; GFX9-NEXT: [[UMAX:%[0-9]+]]:_(s16) = G_UMAX [[AND]], [[AND1]]
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMAX]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_umax_s8
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+ ; GFX1250-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]]
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX1250-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C]]
+ ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s16) = G_UMAX [[AND]], [[AND1]]
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMAX]](s16)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s8) = G_TRUNC %0
@@ -216,6 +258,17 @@ body: |
; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
; GFX9-NEXT: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[AND]], [[AND1]]
; GFX9-NEXT: $vgpr0 = COPY [[UMAX]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_umax_s17
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 131071
+ ; GFX1250-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
+ ; GFX1250-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
+ ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[AND]], [[AND1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[UMAX]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s17) = G_TRUNC %0
@@ -266,6 +319,18 @@ body: |
; GFX9-NEXT: [[UMAX1:%[0-9]+]]:_(s32) = G_UMAX [[UV1]], [[UV3]]
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UMAX]](s32), [[UMAX1]](s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_umax_v2s32
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
+ ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[UMAX1:%[0-9]+]]:_(s32) = G_UMAX [[UV1]], [[UV3]]
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UMAX]](s32), [[UMAX1]](s32)
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = COPY $vgpr2_vgpr3
%2:_(<2 x s32>) = G_UMAX %0, %1
@@ -316,6 +381,19 @@ body: |
; GFX9-NEXT: [[UMAX2:%[0-9]+]]:_(s32) = G_UMAX [[UV2]], [[UV5]]
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UMAX]](s32), [[UMAX1]](s32), [[UMAX2]](s32)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_umax_v3s32
+ ; GFX1250: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
+ ; GFX1250-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>)
+ ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(s32) = G_UMAX [[UV]], [[UV3]]
+ ; GFX1250-NEXT: [[UMAX1:%[0-9]+]]:_(s32) = G_UMAX [[UV1]], [[UV4]]
+ ; GFX1250-NEXT: [[UMAX2:%[0-9]+]]:_(s32) = G_UMAX [[UV2]], [[UV5]]
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UMAX]](s32), [[UMAX1]](s32), [[UMAX2]](s32)
+ ; GFX1250-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
%0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
%1:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5
%2:_(<3 x s32>) = G_UMAX %0, %1
@@ -378,6 +456,14 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
; GFX9-NEXT: [[UMAX:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0 = COPY [[UMAX]](<2 x s16>)
+ ;
+ ; GFX1250-LABEL: name: test_umax_v2s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[UMAX]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
%1:_(<2 x s16>) = COPY $vgpr1
%2:_(<2 x s16>) = G_UMAX %0, %1
@@ -463,6 +549,26 @@ body: |
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMAX1]](s16)
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32)
; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_umax_v3s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
+ ; GFX1250-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX1250-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>)
+ ; GFX1250-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[UMAX1:%[0-9]+]]:_(s16) = G_UMAX [[TRUNC]], [[TRUNC1]]
+ ; GFX1250-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UMAX]](<2 x s16>)
+ ; GFX1250-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMAX1]](s16)
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32)
+ ; GFX1250-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
%0:_(<3 x s16>) = G_IMPLICIT_DEF
%1:_(<3 x s16>) = G_IMPLICIT_DEF
%2:_(<3 x s16>) = G_UMAX %0, %1
@@ -562,6 +668,18 @@ body: |
; GFX9-NEXT: [[UMAX1:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[UV1]], [[UV3]]
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UMAX]](<2 x s16>), [[UMAX1]](<2 x s16>)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+ ;
+ ; GFX1250-LABEL: name: test_umax_v4s16
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
+ ; GFX1250-NEXT: [[UMAX:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[UMAX1:%[0-9]+]]:_(<2 x s16>) = G_UMAX [[UV1]], [[UV3]]
+ ; GFX1250-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UMAX]](<2 x s16>), [[UMAX1]](<2 x s16>)
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
%0:_(<4 x s16>) = COPY $vgpr0_vgpr1
%1:_(<4 x s16>) = COPY $vgpr2_vgpr3
%2:_(<4 x s16>) = G_UMAX %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir
index 8ee0df5..8666c29 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-umin.mir
@@ -4,6 +4,7 @@
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX9 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -mattr=-real-true16 -run-pass=legalizer %s -o - | FileCheck -check-prefixes=GFX1250 %s
---
name: test_umin_s32
@@ -34,6 +35,14 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; GFX9-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0 = COPY [[UMIN]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_umin_s32
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[UMIN]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = G_UMIN %0, %1
@@ -72,6 +81,14 @@ body: |
; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]]
; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
+ ;
+ ; GFX1250-LABEL: name: test_umin_s64
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s64) = G_UMIN [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[UMIN]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
%2:_(s64) = G_UMIN %0, %1
@@ -116,6 +133,17 @@ body: |
; GFX9-NEXT: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC1]]
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMIN]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_umin_s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC1]]
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMIN]](s16)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s16) = G_TRUNC %0
@@ -169,6 +197,20 @@ body: |
; GFX9-NEXT: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[AND]], [[AND1]]
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMIN]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_umin_s8
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+ ; GFX1250-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]]
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX1250-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C]]
+ ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s16) = G_UMIN [[AND]], [[AND1]]
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMIN]](s16)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s8) = G_TRUNC %0
@@ -216,6 +258,17 @@ body: |
; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
; GFX9-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AND]], [[AND1]]
; GFX9-NEXT: $vgpr0 = COPY [[UMIN]](s32)
+ ;
+ ; GFX1250-LABEL: name: test_umin_s17
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 131071
+ ; GFX1250-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
+ ; GFX1250-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
+ ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[AND]], [[AND1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[UMIN]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s17) = G_TRUNC %0
@@ -266,6 +319,18 @@ body: |
; GFX9-NEXT: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[UV1]], [[UV3]]
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UMIN]](s32), [[UMIN1]](s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_umin_v2s32
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
+ ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[UV1]], [[UV3]]
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[UMIN]](s32), [[UMIN1]](s32)
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = COPY $vgpr2_vgpr3
%2:_(<2 x s32>) = G_UMIN %0, %1
@@ -316,6 +381,19 @@ body: |
; GFX9-NEXT: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[UV2]], [[UV5]]
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UMIN]](s32), [[UMIN1]](s32), [[UMIN2]](s32)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_umin_v3s32
+ ; GFX1250: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>)
+ ; GFX1250-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>)
+ ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[UV]], [[UV3]]
+ ; GFX1250-NEXT: [[UMIN1:%[0-9]+]]:_(s32) = G_UMIN [[UV1]], [[UV4]]
+ ; GFX1250-NEXT: [[UMIN2:%[0-9]+]]:_(s32) = G_UMIN [[UV2]], [[UV5]]
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[UMIN]](s32), [[UMIN1]](s32), [[UMIN2]](s32)
+ ; GFX1250-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
%0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
%1:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5
%2:_(<3 x s32>) = G_UMIN %0, %1
@@ -378,6 +456,14 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
; GFX9-NEXT: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[COPY]], [[COPY1]]
; GFX9-NEXT: $vgpr0 = COPY [[UMIN]](<2 x s16>)
+ ;
+ ; GFX1250-LABEL: name: test_umin_v2s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[COPY]], [[COPY1]]
+ ; GFX1250-NEXT: $vgpr0 = COPY [[UMIN]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
%1:_(<2 x s16>) = COPY $vgpr1
%2:_(<2 x s16>) = G_UMIN %0, %1
@@ -463,6 +549,26 @@ body: |
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMIN1]](s16)
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32)
; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ;
+ ; GFX1250-LABEL: name: test_umin_v3s16
+ ; GFX1250: liveins: $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
+ ; GFX1250-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+ ; GFX1250-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+ ; GFX1250-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; GFX1250-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>)
+ ; GFX1250-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+ ; GFX1250-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+ ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[UMIN1:%[0-9]+]]:_(s16) = G_UMIN [[TRUNC]], [[TRUNC1]]
+ ; GFX1250-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UMIN]](<2 x s16>)
+ ; GFX1250-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
+ ; GFX1250-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMIN1]](s16)
+ ; GFX1250-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[BITCAST2]](s32), [[LSHR]](s32), [[ANYEXT]](s32)
+ ; GFX1250-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
%0:_(<3 x s16>) = G_IMPLICIT_DEF
%1:_(<3 x s16>) = G_IMPLICIT_DEF
%2:_(<3 x s16>) = G_UMIN %0, %1
@@ -562,6 +668,18 @@ body: |
; GFX9-NEXT: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV1]], [[UV3]]
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UMIN]](<2 x s16>), [[UMIN1]](<2 x s16>)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+ ;
+ ; GFX1250-LABEL: name: test_umin_v4s16
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
+ ; GFX1250-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
+ ; GFX1250-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
+ ; GFX1250-NEXT: [[UMIN:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV]], [[UV2]]
+ ; GFX1250-NEXT: [[UMIN1:%[0-9]+]]:_(<2 x s16>) = G_UMIN [[UV1]], [[UV3]]
+ ; GFX1250-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[UMIN]](<2 x s16>), [[UMIN1]](<2 x s16>)
+ ; GFX1250-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
%0:_(<4 x s16>) = COPY $vgpr0_vgpr1
%1:_(<4 x s16>) = COPY $vgpr2_vgpr3
%2:_(<4 x s16>) = G_UMIN %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
index 7916267..800df89 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -2,6 +2,7 @@
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -o - < %s | FileCheck %s --check-prefixes=GFX,GFX6
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -o - < %s | FileCheck %s --check-prefixes=GFX,GFX8
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -o - < %s | FileCheck %s --check-prefixes=GFX,GFX10
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 -o - < %s | FileCheck %s --check-prefixes=GFX,GFX1250
declare i16 @llvm.abs.i16(i16, i1)
declare i32 @llvm.abs.i32(i32, i1)
@@ -13,11 +14,30 @@ declare <3 x i16> @llvm.abs.v3i16(<3 x i16>, i1)
declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
define amdgpu_cs i16 @abs_sgpr_i16(i16 inreg %arg) {
-; GFX-LABEL: abs_sgpr_i16:
-; GFX: ; %bb.0:
-; GFX-NEXT: s_sext_i32_i16 s0, s0
-; GFX-NEXT: s_abs_i32 s0, s0
-; GFX-NEXT: ; return to shader part epilog
+; GFX6-LABEL: abs_sgpr_i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_sext_i32_i16 s0, s0
+; GFX6-NEXT: s_abs_i32 s0, s0
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_sext_i32_i16 s0, s0
+; GFX8-NEXT: s_abs_i32 s0, s0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_sext_i32_i16 s0, s0
+; GFX10-NEXT: s_abs_i32 s0, s0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_sext_i32_i16 s0, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_abs_i32 s0, s0
+; GFX1250-NEXT: ; return to shader part epilog
%res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
ret i16 %res
}
@@ -32,14 +52,42 @@ define amdgpu_cs i32 @abs_sgpr_i32(i32 inreg %arg) {
}
define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) {
-; GFX-LABEL: abs_sgpr_i64:
-; GFX: ; %bb.0:
-; GFX-NEXT: s_ashr_i32 s2, s1, 31
-; GFX-NEXT: s_add_u32 s0, s0, s2
-; GFX-NEXT: s_mov_b32 s3, s2
-; GFX-NEXT: s_addc_u32 s1, s1, s2
-; GFX-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GFX-NEXT: ; return to shader part epilog
+; GFX6-LABEL: abs_sgpr_i64:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_ashr_i32 s2, s1, 31
+; GFX6-NEXT: s_add_u32 s0, s0, s2
+; GFX6-NEXT: s_mov_b32 s3, s2
+; GFX6-NEXT: s_addc_u32 s1, s1, s2
+; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_i64:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_ashr_i32 s2, s1, 31
+; GFX8-NEXT: s_add_u32 s0, s0, s2
+; GFX8-NEXT: s_mov_b32 s3, s2
+; GFX8-NEXT: s_addc_u32 s1, s1, s2
+; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_ashr_i32 s2, s1, 31
+; GFX10-NEXT: s_add_u32 s0, s0, s2
+; GFX10-NEXT: s_mov_b32 s3, s2
+; GFX10-NEXT: s_addc_u32 s1, s1, s2
+; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_ashr_i32 s2, s1, 31
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_mov_b32 s3, s2
+; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX1250-NEXT: ; return to shader part epilog
%res = call i64 @llvm.abs.i64(i64 %arg, i1 false)
ret i64 %res
}
@@ -78,6 +126,14 @@ define amdgpu_cs i16 @abs_vgpr_i16(i16 %arg) {
; GFX10-NEXT: v_max_i16 v0, v0, v1
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_vgpr_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_sub_nc_u16 v1, 0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_max_i16 v0, v0, v1
+; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-NEXT: ; return to shader part epilog
%res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
ret i16 %res
}
@@ -103,6 +159,14 @@ define amdgpu_cs i32 @abs_vgpr_i32(i32 %arg) {
; GFX10-NEXT: v_max_i32_e32 v0, v0, v1
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_vgpr_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_max_i32_e32 v0, v0, v1
+; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-NEXT: ; return to shader part epilog
%res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
ret i32 %res
}
@@ -140,6 +204,20 @@ define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) {
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_vgpr_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_e32 v3, v2
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_xor_b32_e32 v0, v0, v2
+; GFX1250-NEXT: v_xor_b32_e32 v1, v1, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1250-NEXT: ; return to shader part epilog
%res = call i64 @llvm.abs.i64(i64 %arg, i1 false)
ret i64 %res
}
@@ -192,6 +270,24 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_vgpr_v4i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_sub_nc_u32 v4, 0, v0 :: v_dual_sub_nc_u32 v5, 0, v1
+; GFX1250-NEXT: v_dual_sub_nc_u32 v6, 0, v2 :: v_dual_sub_nc_u32 v7, 0, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_max_i32_e32 v0, v0, v4
+; GFX1250-NEXT: v_max_i32_e32 v1, v1, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_max_i32_e32 v2, v2, v6
+; GFX1250-NEXT: v_max_i32_e32 v3, v3, v7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_readfirstlane_b32 s2, v2
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v3
+; GFX1250-NEXT: ; return to shader part epilog
%res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false)
ret <4 x i32> %res
}
@@ -243,6 +339,21 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_vgpr_v2i8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_sub_nc_u16 v2, 0, v0
+; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_max_i16 v0, v0, v2
+; GFX1250-NEXT: v_max_i16 v1, v1, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1250-NEXT: ; return to shader part epilog
%res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false)
ret <2 x i8> %res
}
@@ -307,6 +418,27 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_vgpr_v3i8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v0
+; GFX1250-NEXT: v_sub_nc_u16 v4, 0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_sub_nc_u16 v5, 0, v2
+; GFX1250-NEXT: v_max_i16 v0, v0, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_max_i16 v1, v1, v4
+; GFX1250-NEXT: v_max_i16 v2, v2, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1250-NEXT: v_readfirstlane_b32 s2, v2
+; GFX1250-NEXT: ; return to shader part epilog
%res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false)
ret <3 x i8> %res
}
@@ -341,6 +473,16 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
; GFX10-NEXT: s_abs_i32 s0, s0
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s1, s0
; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_v2i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_sext_i32_i16 s1, s0
+; GFX1250-NEXT: s_ashr_i32 s0, s0, 16
+; GFX1250-NEXT: s_abs_i32 s1, s1
+; GFX1250-NEXT: s_abs_i32 s0, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX1250-NEXT: ; return to shader part epilog
%res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false)
ret <2 x i16> %res
}
@@ -375,6 +517,14 @@ define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
; GFX10-NEXT: v_pk_max_i16 v0, v0, v1
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_vgpr_v2i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_pk_sub_i16 v1, 0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_max_i16 v0, v0, v1
+; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-NEXT: ; return to shader part epilog
%res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false)
ret <2 x i16> %res
}
@@ -416,6 +566,17 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) {
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s0
; GFX10-NEXT: s_abs_i32 s1, s1
; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_v3i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_sext_i32_i16 s2, s0
+; GFX1250-NEXT: s_ashr_i32 s0, s0, 16
+; GFX1250-NEXT: s_abs_i32 s2, s2
+; GFX1250-NEXT: s_abs_i32 s0, s0
+; GFX1250-NEXT: s_sext_i32_i16 s1, s1
+; GFX1250-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX1250-NEXT: s_abs_i32 s1, s1
+; GFX1250-NEXT: ; return to shader part epilog
%res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false)
ret <3 x i16> %res
}
@@ -460,6 +621,18 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_vgpr_v3i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_pk_sub_i16 v2, 0, v0
+; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_pk_max_i16 v0, v0, v2
+; GFX1250-NEXT: v_max_i16 v1, v1, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1250-NEXT: ; return to shader part epilog
%res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false)
ret <3 x i16> %res
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
index 85c1d3a..390f62d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn--amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
; GFX10-LABEL: test_wave32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
index ce8cba2..67a388e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn--amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s
define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
; GFX10-LABEL: test_wave32:
@@ -10,9 +10,8 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
; GFX10-NEXT: s_load_dword s1, s[8:9], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s0, 0
-; GFX10-NEXT: s_cselect_b32 s0, 1, 0
-; GFX10-NEXT: s_and_b32 s0, 1, s0
-; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0
+; GFX10-NEXT: s_and_b32 s0, exec_lo, s0
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: global_store_dword v[0:1], v0, off
@@ -26,9 +25,8 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
; GFX11-NEXT: s_load_b32 s1, s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_eq_u32 s0, 0
-; GFX11-NEXT: s_cselect_b32 s0, 1, 0
-; GFX11-NEXT: s_and_b32 s0, 1, s0
-; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; GFX11-NEXT: s_cselect_b32 s0, exec_lo, 0
+; GFX11-NEXT: s_and_b32 s0, exec_lo, s0
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.implicit.ptr.buffer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.implicit.ptr.buffer.ll
index 8d9f9d1..4687b83 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.implicit.ptr.buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.implicit.ptr.buffer.ll
@@ -1,4 +1,4 @@
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d < %s | FileCheck -check-prefix=GCN %s
; FIXME: Dropped parts from original test
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll
index 62f8f89..79a9291 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=GFX12,GFX1200 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=GFX12,GFX1250 %s
; Natural mapping
define amdgpu_ps float @raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
@@ -99,26 +100,47 @@ define amdgpu_ps <2 x float> @raw_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vg
; GFX8-NEXT: $vgpr1 = COPY [[COPY9]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
;
- ; GFX12-LABEL: name: raw_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN]].sub1
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY8]]
- ; GFX12-NEXT: $vgpr1 = COPY [[COPY9]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ; GFX1200-LABEL: name: raw_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN]].sub1
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY8]]
+ ; GFX1200-NEXT: $vgpr1 = COPY [[COPY9]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN_RTN]].sub1
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY8]]
+ ; GFX1250-NEXT: $vgpr1 = COPY [[COPY9]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%ret = call i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to <2 x float>
ret <2 x float> %cast
@@ -142,22 +164,39 @@ define amdgpu_ps void @raw_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgp
; GFX8-NEXT: BUFFER_ATOMIC_ADD_X2_OFFEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: BUFFER_ATOMIC_ADD_X2_VBUFFER_OFFEN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -217,58 +256,111 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_vof
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
ret float %cast
@@ -328,57 +420,109 @@ define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgp
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUFFER_ATOMIC_ADD_VBUFFER_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: BUFFER_ATOMIC_ADD_VBUFFER_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: BUFFER_ATOMIC_ADD_VBUFFER_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -400,21 +544,40 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_vof
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_OFFEN_RTN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%voffset = add i32 %voffset.base, 4095
%ret = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll
index 364ed62..9f1b7a6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck --check-prefix=GFX1200 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck --check-prefix=GFX1250 %s
; Natural mapping
@@ -24,24 +25,43 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_
; GFX8-NEXT: $vgpr0 = COPY [[COPY8]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN]].sub0
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY8]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN]].sub0
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY8]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN]].sub0
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY8]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
ret float %cast
@@ -66,22 +86,39 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__vgpr_val__vgpr_cmp__
; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i32_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_cmpswap_i32_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_cmpswap_i32_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -145,62 +182,119 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_
; GFX8-NEXT: $vgpr0 = COPY [[COPY15]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN]].sub0
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY15]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN]].sub0
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY15]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN]].sub0
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY15]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
ret float %cast
@@ -263,60 +357,115 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN [[REG_SEQUENCE2]], [[COPY10]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -341,24 +490,46 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_
; GFX8-NEXT: $vgpr0 = COPY [[COPY8]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN]].sub0
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY8]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN]].sub0
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY8]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN [[REG_SEQUENCE1]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_OFFEN_RTN]].sub0
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY9]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%voffset = add i32 %voffset.base, 4095
%ret = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
@@ -395,33 +566,61 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr
; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
;
- ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
- ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
- ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX1200-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
+ ; GFX1200-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX1200-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128_align2 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
+ ; GFX1250-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX1250-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to double
ret double %cast
@@ -450,26 +649,47 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i64_noret__vgpr_val__vgpr_cmp__
; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_OFFEN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
- ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1200-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1250-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -542,71 +762,137 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr
; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
;
- ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
- ; GFX12-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
- ; GFX12-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0
- ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec
- ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]]
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec
- ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX1200-LABEL: name: raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0
+ ; GFX1200-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec
+ ; GFX1200-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]]
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec
+ ; GFX1200-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128_align2 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY17:%[0-9]+]]:vreg_64_align2 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0
+ ; GFX1250-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec
+ ; GFX1250-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]]
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec
+ ; GFX1250-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to double
ret double %cast
@@ -673,64 +959,123 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
- ; GFX12-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
- ; GFX12-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3
- ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3
+ ; GFX1200-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3
+ ; GFX1250-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -764,33 +1109,64 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr
; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
;
- ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
- ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
- ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX1200-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec
+ ; GFX1200-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX1200-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY8]], [[COPY10]], 0, implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128_align2 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE3]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE2]], [[COPY9]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX1250-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; GFX1250-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%voffset = add i32 %voffset.base, 4095
%ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to double
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
index 46ca43b..7003bb1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12,GFX1200 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12,GFX1250 %s
; FIXME: Test with SI when argument lowering not broken for f16
; Natural mapping
@@ -124,52 +125,99 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffs
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret float %val
}
@@ -226,55 +274,105 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffs
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret float %val
}
@@ -509,23 +607,41 @@ define amdgpu_ps <2 x float> @raw_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sg
; GFX8-NEXT: $vgpr1 = COPY [[COPY7]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
;
- ; GFX12-LABEL: name: raw_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub0
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub1
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY6]]
- ; GFX12-NEXT: $vgpr1 = COPY [[COPY7]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ; GFX1200-LABEL: name: raw_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub0
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub1
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY6]]
+ ; GFX1200-NEXT: $vgpr1 = COPY [[COPY7]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub0
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub1
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY6]]
+ ; GFX1250-NEXT: $vgpr1 = COPY [[COPY7]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%val = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret <2 x float> %val
}
@@ -551,25 +667,45 @@ define amdgpu_ps <3 x float> @raw_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sg
; GFX8-NEXT: $vgpr2 = COPY [[COPY8]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
- ; GFX12-LABEL: name: raw_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN]].sub0
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN]].sub1
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN]].sub2
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY6]]
- ; GFX12-NEXT: $vgpr1 = COPY [[COPY7]]
- ; GFX12-NEXT: $vgpr2 = COPY [[COPY8]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ; GFX1200-LABEL: name: raw_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN]].sub0
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN]].sub1
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN]].sub2
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY6]]
+ ; GFX1200-NEXT: $vgpr1 = COPY [[COPY7]]
+ ; GFX1200-NEXT: $vgpr2 = COPY [[COPY8]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN:%[0-9]+]]:vreg_96_align2 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN]].sub0
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN]].sub1
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFEN]].sub2
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY6]]
+ ; GFX1250-NEXT: $vgpr1 = COPY [[COPY7]]
+ ; GFX1250-NEXT: $vgpr2 = COPY [[COPY8]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%val = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret <3 x float> %val
}
@@ -597,27 +733,49 @@ define amdgpu_ps <4 x float> @raw_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sg
; GFX8-NEXT: $vgpr3 = COPY [[COPY9]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
- ; GFX12-LABEL: name: raw_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub0
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub1
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub2
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub3
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY6]]
- ; GFX12-NEXT: $vgpr1 = COPY [[COPY7]]
- ; GFX12-NEXT: $vgpr2 = COPY [[COPY8]]
- ; GFX12-NEXT: $vgpr3 = COPY [[COPY9]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ; GFX1200-LABEL: name: raw_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub0
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub1
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub2
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub3
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY6]]
+ ; GFX1200-NEXT: $vgpr1 = COPY [[COPY7]]
+ ; GFX1200-NEXT: $vgpr2 = COPY [[COPY8]]
+ ; GFX1200-NEXT: $vgpr3 = COPY [[COPY9]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub0
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub1
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub2
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFEN]].sub3
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY6]]
+ ; GFX1250-NEXT: $vgpr1 = COPY [[COPY7]]
+ ; GFX1250-NEXT: $vgpr2 = COPY [[COPY8]]
+ ; GFX1250-NEXT: $vgpr3 = COPY [[COPY9]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%val = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret <4 x float> %val
}
@@ -715,23 +873,41 @@ define amdgpu_ps <4 x half> @raw_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgp
; GFX8-NEXT: $vgpr1 = COPY [[COPY7]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
;
- ; GFX12-LABEL: name: raw_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub0
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub1
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY6]]
- ; GFX12-NEXT: $vgpr1 = COPY [[COPY7]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ; GFX1200-LABEL: name: raw_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub0
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub1
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY6]]
+ ; GFX1200-NEXT: $vgpr1 = COPY [[COPY7]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub0
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFEN]].sub1
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY6]]
+ ; GFX1250-NEXT: $vgpr1 = COPY [[COPY7]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%val = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret <4 x half> %val
}
@@ -929,52 +1105,99 @@ define amdgpu_ps half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffse
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_VBUFFER_OFFEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_VBUFFER_OFFEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_VBUFFER_OFFEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret half %val
}
@@ -1028,52 +1251,99 @@ define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffse
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_VBUFFER_OFFEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_VBUFFER_OFFEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_VBUFFER_OFFEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
%zext = zext i8 %val to i32
%cast = bitcast i32 %zext to float
@@ -1194,20 +1464,38 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add16
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 16, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add16
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 16, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add16
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%voffset = add i32 %voffset.base, 16
%val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret float %val
@@ -1229,20 +1517,38 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%voffset = add i32 %voffset.base, 4095
%val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret float %val
@@ -1267,20 +1573,38 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4096
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4096, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4096
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4096, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4096
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%voffset = add i32 %voffset.base, 4096
%val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret float %val
@@ -1522,54 +1846,103 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add5000
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
- ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add5000
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
+ ; GFX1200-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset_add5000
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
+ ; GFX1250-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%soffset = add i32 %soffset.base, 5000
%val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret float %val
@@ -1627,52 +2000,102 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add5000
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 5000, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add5000
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 5000, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add5000
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
+ ; GFX1250-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFEN [[V_ADD_U32_e64_]], [[REG_SEQUENCE1]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%voffset = add i32 %voffset.base, 5000
%val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret float %val
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll
index 3fbfb63..4784ac5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.tfe.ll
@@ -5,7 +5,8 @@
; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefix=GFX910
; RUN: llc -global-isel -mcpu=gfx1010 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefix=GFX910
; RUN: llc -global-isel -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX11
-; RUN: llc -global-isel -mcpu=gfx1200 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX12
+; RUN: llc -global-isel -mcpu=gfx1200 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX1200
+; RUN: llc -global-isel -mcpu=gfx1250 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX1250
define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %data_addr, ptr addrspace(1) %tfe_addr) {
; GFX67-LABEL: name: raw_buffer_load_i8_tfe
@@ -110,27 +111,49 @@ define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_i8_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET]].sub1
- ; GFX12-NEXT: GLOBAL_STORE_BYTE [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s8) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_i8_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_BYTE [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s8) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_i8_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_BYTE [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s8) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { i8, i32 } @llvm.amdgcn.raw.buffer.load.sl_i8i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%data = extractvalue { i8, i32 } %res, 0
store i8 %data, ptr addrspace(1) %data_addr
@@ -242,27 +265,49 @@ define amdgpu_ps void @raw_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr addrsp
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_i16_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub1
- ; GFX12-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_i16_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_i16_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { i16, i32 } @llvm.amdgcn.raw.buffer.load.sl_i16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%data = extractvalue { i16, i32 } %res, 0
store i16 %data, ptr addrspace(1) %data_addr
@@ -374,27 +419,49 @@ define amdgpu_ps void @raw_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr addrsp
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_f16_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub1
- ; GFX12-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_f16_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_f16_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { half, i32 } @llvm.amdgcn.raw.buffer.load.sl_f16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%data = extractvalue { half, i32 } %res, 0
store half %data, ptr addrspace(1) %data_addr
@@ -506,27 +573,49 @@ define amdgpu_ps void @raw_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrsp
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_i32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET]].sub1
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s32) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_i32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s32) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_i32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (store (s32) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { i32, i32 } @llvm.amdgcn.raw.buffer.load.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%data = extractvalue { i32, i32 } %res, 0
store i32 %data, ptr addrspace(1) %data_addr
@@ -646,29 +735,53 @@ define amdgpu_ps void @raw_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v2i32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub2
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v2i32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v2i32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_96_align2 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <2 x i32>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v2i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%data = extractvalue { <2 x i32>, i32 } %res, 0
store <2 x i32> %data, ptr addrspace(1) %data_addr
@@ -788,29 +901,53 @@ define amdgpu_ps void @raw_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v2f32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub2
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v2f32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v2f32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_96_align2 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <2 x float>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v2f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%data = extractvalue { <2 x float>, i32 } %res, 0
store <2 x float> %data, ptr addrspace(1) %data_addr
@@ -977,30 +1114,55 @@ define amdgpu_ps void @raw_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v3i32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub2
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub3
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v3i32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub3
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v3i32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub3
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <3 x i32>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%data = extractvalue { <3 x i32>, i32 } %res, 0
store <3 x i32> %data, ptr addrspace(1) %data_addr
@@ -1167,30 +1329,55 @@ define amdgpu_ps void @raw_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v3f32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub2
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub3
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v3f32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub3
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v3f32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_OFFSET]].sub3
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <3 x float>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v3f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%data = extractvalue { <3 x float>, i32 } %res, 0
store <3 x float> %data, ptr addrspace(1) %data_addr
@@ -1318,31 +1505,57 @@ define amdgpu_ps void @raw_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v4i32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_160 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub2
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub3
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub4
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY11]], %subreg.sub3
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v4i32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_160 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub3
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub4
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY11]], %subreg.sub3
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v4i32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_160_align2 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub3
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub4
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY11]], %subreg.sub3
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <4 x i32>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%data = extractvalue { <4 x i32>, i32 } %res, 0
store <4 x i32> %data, ptr addrspace(1) %data_addr
@@ -1470,31 +1683,57 @@ define amdgpu_ps void @raw_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v4f32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_160 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub0
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub2
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub3
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub4
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY11]], %subreg.sub3
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v4f32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_160 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub3
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub4
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY11]], %subreg.sub3
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v4f32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET:%[0-9]+]]:vreg_160_align2 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub0
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub2
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub3
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_OFFSET]].sub4
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY11]], %subreg.sub3
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <4 x float>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v4f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
%data = extractvalue { <4 x float>, i32 } %res, 0
store <4 x float> %data, ptr addrspace(1) %data_addr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
index 63ca7be..c365d57 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12,GFX1200 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12,GFX1250 %s
; FIXME: Test with SI when argument lowering not broken for f16

; Natural mapping
@@ -126,52 +127,99 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -309,55 +357,105 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -618,22 +716,39 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -657,23 +772,41 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v3f32
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v3f32
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v3f32
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -698,24 +831,43 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f32
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f32
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f32
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -876,22 +1028,39 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -946,54 +1115,103 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE2]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -1080,20 +1298,38 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_16
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_16
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_16
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%voffset.add = add i32 %voffset, 16
call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret void
@@ -1115,20 +1351,38 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4095
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4095
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4095
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%voffset.add = add i32 %voffset, 4095
call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret void
@@ -1153,20 +1407,38 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4096
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4096, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4096
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4096, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f32_voffset_add_4096
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%voffset.add = add i32 %voffset, 4096
call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret void
@@ -1256,20 +1528,38 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_16
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_16
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_16
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%voffset.add = add i32 %voffset, 16
call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret void
@@ -1291,20 +1581,38 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_4095
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_4095
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_4095
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%voffset.add = add i32 %voffset, 4095
call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret void
@@ -1329,20 +1637,38 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_4096
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4096, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_4096
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4096, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset_add_4096
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%voffset.add = add i32 %voffset, 4096
call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret void
@@ -1400,52 +1726,102 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_offset_add_5000
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 5000, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_offset_add_5000
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE1]], [[COPY6]], 5000, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_offset_add_5000
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; GFX1250-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact [[COPY4]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE1]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_1]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
%voffset.add = add i32 %voffset, 5000
call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret void
@@ -1501,51 +1877,97 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr_soffset_offset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 5000, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr_soffset_offset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 5000, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr_soffset_offset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact [[COPY4]], [[REG_SEQUENCE1]], [[COPY5]], 5000, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 5000, i32 %soffset, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll
index 75d6c59..484639a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1250 %s
; Natural mapping
define amdgpu_ps float @struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
@@ -22,23 +23,41 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
ret float %cast
@@ -63,23 +82,41 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc_
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
ret float %cast
@@ -109,28 +146,51 @@ define amdgpu_ps <2 x float> @struct_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc_
; GFX8-NEXT: $vgpr1 = COPY [[COPY10]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
;
- ; GFX12-LABEL: name: struct_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN]].sub1
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]]
- ; GFX12-NEXT: $vgpr1 = COPY [[COPY10]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ; GFX1200-LABEL: name: struct_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN]].sub1
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY9]]
+ ; GFX1200-NEXT: $vgpr1 = COPY [[COPY10]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN_RTN]].sub1
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY9]]
+ ; GFX1250-NEXT: $vgpr1 = COPY [[COPY10]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%ret = call i64 @llvm.amdgcn.struct.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to <2 x float>
ret <2 x float> %cast
@@ -156,24 +216,43 @@ define amdgpu_ps void @struct_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__
; GFX8-NEXT: BUFFER_ATOMIC_ADD_X2_BOTHEN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_ATOMIC_ADD_X2_VBUFFER_BOTHEN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i64 @llvm.amdgcn.struct.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -236,61 +315,117 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
ret float %cast
@@ -353,60 +488,115 @@ define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN [[COPY8]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -431,23 +621,41 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 3, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_VBUFFER_BOTHEN_RTN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
%cast = bitcast i32 %ret to float
ret float %cast
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll
index c9d1227..7dab257 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1250 %s
; Natural mapping
define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i32 %val, i32 %cmp, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
@@ -25,26 +26,47 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sg
; GFX8-NEXT: $vgpr0 = COPY [[COPY9]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN]].sub0
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN]].sub0
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY9]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN]].sub0
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY9]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
ret float %cast
@@ -71,24 +93,43 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_noret_i32__vgpr_val__vgpr_cm
; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_noret_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_cmpswap_noret_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_cmpswap_noret_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -155,65 +196,125 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vg
; GFX8-NEXT: $vgpr0 = COPY [[COPY17]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN]].sub0
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY17]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN]].sub0
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY17]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN]].sub0
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY17]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
ret float %cast
@@ -279,63 +380,121 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cm
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY12]], %subreg.sub1
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -362,26 +521,50 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sg
; GFX8-NEXT: $vgpr0 = COPY [[COPY9]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN]].sub0
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY9]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN]].sub0
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY9]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY9]], 0, implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[V_ADD_U32_e64_]], %subreg.sub1
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_VBUFFER_BOTHEN_RTN]].sub0
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY10]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%voffset = add i32 %voffset.base, 4095
%ret = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %val, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i32 %ret to float
@@ -420,35 +603,65 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__s
; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
;
- ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
- ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
- ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
- ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX1200-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1200-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX1200-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; GFX1200-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1250-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128_align2 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX1250-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; GFX1250-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to double
ret double %cast
@@ -479,28 +692,51 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_noret_i64__vgpr_val__vgpr_cm
; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_noret_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
- ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
- ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_cmpswap_noret_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1200-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1200-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_cmpswap_noret_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1250-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1250-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -576,74 +812,143 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__v
; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
;
- ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
- ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
- ; GFX12-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
- ; GFX12-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1
- ; GFX12-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0
- ; GFX12-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec
- ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]]
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec
- ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX1200-LABEL: name: struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1
+ ; GFX1200-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY19:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0
+ ; GFX1200-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec
+ ; GFX1200-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]]
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec
+ ; GFX1200-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1
+ ; GFX1250-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128_align2 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY19:%[0-9]+]]:vreg_64_align2 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0
+ ; GFX1250-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec
+ ; GFX1250-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]]
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec
+ ; GFX1250-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to double
ret double %cast
@@ -713,67 +1018,129 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cm
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
- ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
- ; GFX12-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
- ; GFX12-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1
- ; GFX12-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3
- ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1
+ ; GFX1200-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3
+ ; GFX1200-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1
+ ; GFX1250-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3
+ ; GFX1250-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
%ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -809,35 +1176,68 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__s
; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
;
- ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
- ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
- ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
- ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
- ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ; GFX1200-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1200-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1200-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec
+ ; GFX1200-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; GFX1200-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY9]], [[COPY11]], 0, implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[V_ADD_U32_e64_]], %subreg.sub1
+ ; GFX1250-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+ ; GFX1250-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128_align2 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub1
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec
+ ; GFX1250-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec
+ ; GFX1250-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%voffset = add i32 %voffset.base, 4095
%ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%cast = bitcast i64 %ret to double
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
index 9b5e46b3..dbef90f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1250 %s
; Natural mapping
define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
@@ -21,22 +22,39 @@ define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vof
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %val
}
@@ -63,25 +81,45 @@ define amdgpu_ps <2 x float> @struct_buffer_load_v2f32__sgpr_rsrc__vgpr_vindex__
; GFX8-NEXT: $vgpr1 = COPY [[COPY8]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
;
- ; GFX12-LABEL: name: struct_buffer_load_v2f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub1
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY7]]
- ; GFX12-NEXT: $vgpr1 = COPY [[COPY8]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ; GFX1200-LABEL: name: struct_buffer_load_v2f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub1
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY7]]
+ ; GFX1200-NEXT: $vgpr1 = COPY [[COPY8]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_v2f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub1
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY7]]
+ ; GFX1250-NEXT: $vgpr1 = COPY [[COPY8]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%val = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <2 x float> %val
}
@@ -110,27 +148,49 @@ define amdgpu_ps <3 x float> @struct_buffer_load_v3f32__sgpr_rsrc__vgpr_vindex__
; GFX8-NEXT: $vgpr2 = COPY [[COPY9]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
;
- ; GFX12-LABEL: name: struct_buffer_load_v3f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub1
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub2
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY7]]
- ; GFX12-NEXT: $vgpr1 = COPY [[COPY8]]
- ; GFX12-NEXT: $vgpr2 = COPY [[COPY9]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ; GFX1200-LABEL: name: struct_buffer_load_v3f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub1
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub2
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY7]]
+ ; GFX1200-NEXT: $vgpr1 = COPY [[COPY8]]
+ ; GFX1200-NEXT: $vgpr2 = COPY [[COPY9]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_v3f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN:%[0-9]+]]:vreg_96_align2 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub1
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub2
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY7]]
+ ; GFX1250-NEXT: $vgpr1 = COPY [[COPY8]]
+ ; GFX1250-NEXT: $vgpr2 = COPY [[COPY9]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%val = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <3 x float> %val
}
@@ -161,29 +221,53 @@ define amdgpu_ps <4 x float> @struct_buffer_load_v4f32__sgpr_rsrc__vgpr_vindex__
; GFX8-NEXT: $vgpr3 = COPY [[COPY10]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
;
- ; GFX12-LABEL: name: struct_buffer_load_v4f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub1
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub2
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub3
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY7]]
- ; GFX12-NEXT: $vgpr1 = COPY [[COPY8]]
- ; GFX12-NEXT: $vgpr2 = COPY [[COPY9]]
- ; GFX12-NEXT: $vgpr3 = COPY [[COPY10]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ; GFX1200-LABEL: name: struct_buffer_load_v4f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub1
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub2
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub3
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY7]]
+ ; GFX1200-NEXT: $vgpr1 = COPY [[COPY8]]
+ ; GFX1200-NEXT: $vgpr2 = COPY [[COPY9]]
+ ; GFX1200-NEXT: $vgpr3 = COPY [[COPY10]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_v4f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub1
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub2
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub3
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY7]]
+ ; GFX1250-NEXT: $vgpr1 = COPY [[COPY8]]
+ ; GFX1250-NEXT: $vgpr2 = COPY [[COPY9]]
+ ; GFX1250-NEXT: $vgpr3 = COPY [[COPY10]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%val = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <4 x float> %val
}
@@ -208,23 +292,41 @@ define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vof
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %voffset, i32 %soffset, i32 0)
ret float %val
}
@@ -248,22 +350,42 @@ define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vof
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_ADD_U32_e64_]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%voffset = add i32 %voffset.base, 4095
%val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %val
@@ -287,22 +409,39 @@ define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vof
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_soffset_64
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 64
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_soffset_64
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 64
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_soffset_64
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 64
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 64, i32 0)
ret float %val
}
@@ -363,59 +502,113 @@ define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_vof
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %val
}
@@ -438,22 +631,39 @@ define amdgpu_ps float @struct_buffer_load_i8_zext__sgpr_rsrc__vgpr_vindex__vgpr
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_i8_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_i8_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_i8_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%ext = zext i8 %val to i32
%cast = bitcast i32 %ext to float
@@ -478,22 +688,39 @@ define amdgpu_ps float @struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%ext = sext i8 %val to i32
%cast = bitcast i32 %ext to float
@@ -519,23 +746,41 @@ define amdgpu_ps float @struct_buffer_load_i8_sext_wrong_width(<4 x i32> inreg %
; GFX8-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_i8_sext_wrong_width
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
- ; GFX12-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN]], 0, 4, implicit $exec
- ; GFX12-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_i8_sext_wrong_width
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1200-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN]], 0, 4, implicit $exec
+ ; GFX1200-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_i8_sext_wrong_width
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1250-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_VBUFFER_BOTHEN]], 0, 4, implicit $exec
+ ; GFX1250-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%trunc = trunc i8 %val to i4
%ext = sext i4 %trunc to i32
@@ -561,22 +806,39 @@ define amdgpu_ps float @struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgp
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%ext = zext i16 %val to i32
%cast = bitcast i32 %ext to float
@@ -601,22 +863,39 @@ define amdgpu_ps float @struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgp
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%ext = sext i16 %val to i32
%cast = bitcast i32 %ext to float
@@ -642,23 +921,41 @@ define amdgpu_ps float @struct_buffer_load_i16_sext_wrong_width(<4 x i32> inreg
; GFX8-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_i16_sext_wrong_width
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
- ; GFX12-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]], 0, 8, implicit $exec
- ; GFX12-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_i16_sext_wrong_width
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]], 0, 8, implicit $exec
+ ; GFX1200-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_i16_sext_wrong_width
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]], 0, 8, implicit $exec
+ ; GFX1250-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
%trunc = trunc i16 %val to i8
%ext = sext i8 %trunc to i32
@@ -685,22 +982,39 @@ define amdgpu_ps half @struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voff
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret half %val
}
@@ -724,22 +1038,39 @@ define amdgpu_ps <2 x half> @struct_buffer_load_v2f16__sgpr_rsrc__vgpr_vindex__v
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s16>), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <2 x half> %val
}
@@ -772,25 +1103,45 @@ define amdgpu_ps <4 x half> @struct_buffer_load_v4f16__sgpr_rsrc__vgpr_vindex__v
; GFX8-NEXT: $vgpr1 = COPY [[COPY8]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
;
- ; GFX12-LABEL: name: struct_buffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub1
- ; GFX12-NEXT: $vgpr0 = COPY [[COPY7]]
- ; GFX12-NEXT: $vgpr1 = COPY [[COPY8]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ; GFX1200-LABEL: name: struct_buffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub1
+ ; GFX1200-NEXT: $vgpr0 = COPY [[COPY7]]
+ ; GFX1200-NEXT: $vgpr1 = COPY [[COPY8]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_v4f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s16>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub1
+ ; GFX1250-NEXT: $vgpr0 = COPY [[COPY7]]
+ ; GFX1250-NEXT: $vgpr1 = COPY [[COPY8]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
%val = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <4 x half> %val
}
@@ -814,22 +1165,39 @@ define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vof
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
;
- ; GFX12-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_glc
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
- ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ; GFX1200-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_glc
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1200-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_glc
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN]]
+ ; GFX1250-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 1)
ret float %val
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.tfe.ll
index 674fe1c..39cce20 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.tfe.ll
@@ -5,7 +5,8 @@
; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefix=GFX910
; RUN: llc -global-isel -mcpu=gfx1010 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefix=GFX910
; RUN: llc -global-isel -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX11
-; RUN: llc -global-isel -mcpu=gfx1200 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX12
+; RUN: llc -global-isel -mcpu=gfx1200 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX1200
+; RUN: llc -global-isel -mcpu=gfx1250 -mattr=-real-true16 -mtriple=amdgcn-- -stop-after=instruction-select < %s | FileCheck %s -check-prefixes=GFX1250
define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %data_addr, ptr addrspace(1) %tfe_addr) {
; GFX67-LABEL: name: raw_buffer_load_i8_tfe
@@ -114,29 +115,53 @@ define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_i8_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN]].sub1
- ; GFX12-NEXT: GLOBAL_STORE_BYTE [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s8) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_i8_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_BYTE [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s8) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_i8_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_UBYTE_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_BYTE [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s8) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { i8, i32 } @llvm.amdgcn.struct.buffer.load.sl_i8i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { i8, i32 } %res, 0
store i8 %data, ptr addrspace(1) %data_addr
@@ -252,29 +277,53 @@ define amdgpu_ps void @raw_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr addrsp
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_i16_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub1
- ; GFX12-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_i16_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_i16_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { i16, i32 } @llvm.amdgcn.struct.buffer.load.sl_i16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { i16, i32 } %res, 0
store i16 %data, ptr addrspace(1) %data_addr
@@ -390,29 +439,53 @@ define amdgpu_ps void @raw_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr addrsp
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_f16_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub1
- ; GFX12-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_f16_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_f16_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_USHORT_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_SHORT [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s16) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { half, i32 } @llvm.amdgcn.struct.buffer.load.sl_f16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { half, i32 } %res, 0
store half %data, ptr addrspace(1) %data_addr
@@ -528,29 +601,53 @@ define amdgpu_ps void @raw_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrsp
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_i32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN]].sub1
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_i32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_i32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORD_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE1]], [[COPY9]], 0, 0, implicit $exec :: (store (s32) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { i32, i32 } @llvm.amdgcn.struct.buffer.load.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { i32, i32 } %res, 0
store i32 %data, ptr addrspace(1) %data_addr
@@ -674,31 +771,57 @@ define amdgpu_ps void @raw_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v2i32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub1
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub2
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v2i32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v2i32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_96_align2 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <2 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v2i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <2 x i32>, i32 } %res, 0
store <2 x i32> %data, ptr addrspace(1) %data_addr
@@ -822,31 +945,57 @@ define amdgpu_ps void @raw_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v2f32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub1
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub2
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v2f32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v2f32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_96_align2 = BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX2 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<2 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY11]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <2 x float>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v2f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <2 x float>, i32 } %res, 0
store <2 x float> %data, ptr addrspace(1) %data_addr
@@ -1018,32 +1167,59 @@ define amdgpu_ps void @raw_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v3i32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub1
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub2
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub3
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v3i32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub3
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v3i32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub3
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <3 x i32>, i32 } %res, 0
store <3 x i32> %data, ptr addrspace(1) %data_addr
@@ -1215,32 +1391,59 @@ define amdgpu_ps void @raw_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v3f32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub1
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub2
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub3
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v3f32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub3
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v3f32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_TFE_VBUFFER_IDXEN]].sub3
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX3 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<3 x s32>) into %ir.data_addr, align 16, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY12]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <3 x float>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v3f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <3 x float>, i32 } %res, 0
store <3 x float> %data, ptr addrspace(1) %data_addr
@@ -1372,33 +1575,61 @@ define amdgpu_ps void @raw_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY13]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v4i32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_160 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub1
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub2
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub3
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub4
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY12]], %subreg.sub3
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY13]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v4i32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_160 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub3
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub4
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY12]], %subreg.sub3
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY13]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v4i32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_160_align2 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub3
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub4
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY12]], %subreg.sub3
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY13]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <4 x i32>, i32 } %res, 0
store <4 x i32> %data, ptr addrspace(1) %data_addr
@@ -1530,33 +1761,61 @@ define amdgpu_ps void @raw_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addr
; GFX11-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY13]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
; GFX11-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: raw_buffer_load_v4f32_tfe
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_160 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub0
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub1
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub2
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub3
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub4
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY12]], %subreg.sub3
- ; GFX12-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
- ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY13]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: raw_buffer_load_v4f32_tfe
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1200-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_160 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub3
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub4
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY12]], %subreg.sub3
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY13]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: raw_buffer_load_v4f32_tfe
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX1250-NEXT: [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN:%[0-9]+]]:vreg_160_align2 = BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN [[COPY8]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub0
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub1
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub2
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub3
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_TFE_VBUFFER_IDXEN]].sub4
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY12]], %subreg.sub3
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORDX4 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (<4 x s32>) into %ir.data_addr, addrspace 1)
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE2]], [[COPY13]], 0, 0, implicit $exec :: (store (s32) into %ir.tfe_addr, addrspace 1)
+ ; GFX1250-NEXT: S_ENDPGM 0
%res = call { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v4f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <4 x float>, i32 } %res, 0
store <4 x float> %data, ptr addrspace(1) %data_addr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
index 8183d85..c9771b5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1250 %s
; Natural mapping
define amdgpu_ps void @struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
@@ -21,22 +22,39 @@ define amdgpu_ps void @struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex_
; GFX8-NEXT: BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -61,24 +79,43 @@ define amdgpu_ps void @struct_buffer_store_v2f32_sgpr_rsrc__vgpr_val__vgpr_vinde
; GFX8-NEXT: BUFFER_STORE_DWORDX2_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_store_v2f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_store_v2f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_store_v2f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -104,25 +141,45 @@ define amdgpu_ps void @struct_buffer_store_v3f32_sgpr_rsrc__vgpr_val__vgpr_vinde
; GFX8-NEXT: BUFFER_STORE_DWORDX3_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_store_v3f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_store_v3f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_store_v3f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -149,26 +206,47 @@ define amdgpu_ps void @struct_buffer_store_v4f32_sgpr_rsrc__vgpr_val__vgpr_vinde
; GFX8-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_store_v4f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_store_v4f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_store_v4f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -233,64 +311,123 @@ define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vinde
; GFX8-NEXT: bb.5:
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vindex__sgpr_voffset__vgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: successors: %bb.2(0x80000000)
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
- ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
- ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
- ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
- ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.2:
- ; GFX12-NEXT: successors: %bb.3(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
- ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
- ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
- ; GFX12-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
- ; GFX12-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
- ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
- ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
- ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
- ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.3:
- ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_BOTHEN_exact [[COPY11]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
- ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
- ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.4:
- ; GFX12-NEXT: successors: %bb.5(0x80000000)
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: bb.5:
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vindex__sgpr_voffset__vgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
+ ; GFX1200-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1200-NEXT: [[COPY11:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+ ; GFX1200-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX1200-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; GFX1200-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.2:
+ ; GFX1200-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1200-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1200-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; GFX1200-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
+ ; GFX1200-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1200-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.3:
+ ; GFX1200-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_BOTHEN_exact [[COPY11]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
+ ; GFX1200-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1200-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.4:
+ ; GFX1200-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: bb.5:
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vindex__sgpr_voffset__vgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
+ ; GFX1250-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX1250-NEXT: [[COPY11:%[0-9]+]]:vreg_128_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX1250-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
+ ; GFX1250-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+ ; GFX1250-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.2:
+ ; GFX1250-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY14:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]].sub2_sub3
+ ; GFX1250-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1
+ ; GFX1250-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
+ ; GFX1250-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
+ ; GFX1250-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GFX1250-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.3:
+ ; GFX1250-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_BOTHEN_exact [[COPY11]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>), align 1, addrspace 8)
+ ; GFX1250-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+ ; GFX1250-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.4:
+ ; GFX1250-NEXT: successors: %bb.5(0x80000000)
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]]
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: bb.5:
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -313,22 +450,39 @@ define amdgpu_ps void @struct_buffer_store_i8_sgpr_rsrc__vgpr_val__vgpr_vindex__
; GFX8-NEXT: BUFFER_STORE_BYTE_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_store_i8_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_STORE_BYTE_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_store_i8_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_STORE_BYTE_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_store_i8_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_STORE_BYTE_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s8), addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%val.trunc = trunc i32 %val to i8
call void @llvm.amdgcn.struct.buffer.store.i8(i8 %val.trunc, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
@@ -352,22 +506,39 @@ define amdgpu_ps void @struct_buffer_store_i16_sgpr_rsrc__vgpr_val__vgpr_vindex_
; GFX8-NEXT: BUFFER_STORE_SHORT_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_store_i16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_STORE_SHORT_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_store_i16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_STORE_SHORT_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_store_i16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_STORE_SHORT_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (s16), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
%val.trunc = trunc i32 %val to i16
call void @llvm.amdgcn.struct.buffer.store.i16(i16 %val.trunc, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
@@ -391,22 +562,39 @@ define amdgpu_ps void @struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex_
; GFX8-NEXT: BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset_glc
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset_glc
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset_glc
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 1)
ret void
}
@@ -429,22 +617,39 @@ define amdgpu_ps void @struct_buffer_store_v2f16_sgpr_rsrc__vgpr_val__vgpr_vinde
; GFX8-NEXT: BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_store_v2f16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_store_v2f16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_store_v2f16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -475,24 +680,43 @@ define amdgpu_ps void @struct_buffer_store_v4f16_sgpr_rsrc__vgpr_val__vgpr_vinde
; GFX8-NEXT: BUFFER_STORE_DWORDX2_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
;
- ; GFX12-LABEL: name: struct_buffer_store_v4f16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
- ; GFX12: bb.1 (%ir-block.0):
- ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
- ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
- ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
- ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
- ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
- ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
- ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
- ; GFX12-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GFX1200-LABEL: name: struct_buffer_store_v4f16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1200: bb.1 (%ir-block.0):
+ ; GFX1200-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1200-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1200-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1200-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1200-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1200-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1200-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1200-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1200-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1200-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1200-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1200-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1200-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: struct_buffer_store_v4f16_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset
+ ; GFX1250: bb.1 (%ir-block.0):
+ ; GFX1250-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX1250-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX1250-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX1250-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1250-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
+ ; GFX1250-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
+ ; GFX1250-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+ ; GFX1250-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+ ; GFX1250-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX1250-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX1250-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
+ ; GFX1250-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX1250-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s16>), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll
index 7c0484b..1ba2558 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s
declare void @llvm.memcpy.inline.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index 6bb1043..ab8d8c1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-UNALIGNED %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-NOUNALIGNED %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX1250,GFX1250-UNALIGNED %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX1250,GFX1250-NOUNALIGNED %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-NOUNALIGNED %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-UNALIGNED %s
@@ -64,6 +66,52 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v2, v8, v9, v7
; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
+; GFX1250-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
+; GFX1250-UNALIGNED: ; %bb.0:
+; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX1250-UNALIGNED-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1:
+; GFX1250-NOUNALIGNED: ; %bb.0:
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_clause 0xb
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v2, v[0:1], off
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v3, v[0:1], off offset:1
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v4, v[0:1], off offset:2
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v5, v[0:1], off offset:3
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v6, v[0:1], off offset:4
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v7, v[0:1], off offset:5
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v8, v[0:1], off offset:6
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v9, v[0:1], off offset:7
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v10, v[0:1], off offset:8
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v11, v[0:1], off offset:9
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v12, v[0:1], off offset:11
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v0, v[0:1], off offset:10
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0xa
+; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x8
+; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v3, 16, v4 :: v_dual_lshlrev_b32 v2, 24, v5
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x6
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v6
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4
+; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v6, 16, v8 :: v_dual_lshlrev_b32 v5, 24, v9
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x2
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v7, v11, 8, v10
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x1
+; GFX1250-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v12
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v0, v2, v3, v1
+; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v4
+; GFX1250-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v2, v8, v9, v7
+; GFX1250-NOUNALIGNED-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -256,6 +304,34 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v7, 16, v6
; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
+; GFX1250-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
+; GFX1250-UNALIGNED: ; %bb.0:
+; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX1250-UNALIGNED-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
+; GFX1250-NOUNALIGNED: ; %bb.0:
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_clause 0x5
+; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v2, v[0:1], off
+; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v3, v[0:1], off offset:2
+; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v4, v[0:1], off offset:4
+; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v5, v[0:1], off offset:6
+; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v6, v[0:1], off offset:8
+; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v7, v[0:1], off offset:10
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4
+; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x2
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v5, 16, v4
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v7, 16, v6
+; GFX1250-NOUNALIGNED-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -346,16 +422,35 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
}
define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
-; GFX12-LABEL: v_load_constant_v3i32_align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align4:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align4:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_constant_v3i32_align4:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v3i32_align4:
; GFX9: ; %bb.0:
@@ -392,16 +487,35 @@ define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
}
define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
-; GFX12-LABEL: v_load_constant_i96_align8:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-UNALIGNED-LABEL: v_load_constant_i96_align8:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-NOUNALIGNED-LABEL: v_load_constant_i96_align8:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_constant_i96_align8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-LABEL: v_load_constant_i96_align8:
; GFX9: ; %bb.0:
@@ -438,16 +552,35 @@ define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
}
define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
-; GFX12-LABEL: v_load_constant_v3i32_align8:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align8:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align8:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_constant_v3i32_align8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v3i32_align8:
; GFX9: ; %bb.0:
@@ -484,16 +617,35 @@ define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
}
define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
-; GFX12-LABEL: v_load_constant_v6i16_align8:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-UNALIGNED-LABEL: v_load_constant_v6i16_align8:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-NOUNALIGNED-LABEL: v_load_constant_v6i16_align8:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_constant_v6i16_align8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v6i16_align8:
; GFX9: ; %bb.0:
@@ -539,28 +691,67 @@ define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
}
define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
-; GFX12-LABEL: v_load_constant_v12i8_align8:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v13, 8, v0
-; GFX12-NEXT: v_lshrrev_b32_e32 v12, 16, v0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX12-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GFX12-NEXT: v_lshrrev_b32_e32 v9, 8, v2
-; GFX12-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GFX12-NEXT: v_lshrrev_b32_e32 v11, 24, v2
-; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v13
-; GFX12-NEXT: v_mov_b32_e32 v8, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v12
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-UNALIGNED-LABEL: v_load_constant_v12i8_align8:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v0
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v2
+; GFX12-UNALIGNED-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v13
+; GFX12-UNALIGNED-NEXT: v_mov_b32_e32 v8, v2
+; GFX12-UNALIGNED-NEXT: v_mov_b32_e32 v2, v12
+; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-NOUNALIGNED-LABEL: v_load_constant_v12i8_align8:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v0
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v2
+; GFX12-NOUNALIGNED-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v13
+; GFX12-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, v2
+; GFX12-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, v12
+; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_constant_v12i8_align8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_dual_lshrrev_b32 v13, 8, v0 :: v_dual_lshrrev_b32 v12, 16, v0
+; GFX1250-NEXT: v_dual_lshrrev_b32 v3, 24, v0 :: v_dual_lshrrev_b32 v5, 8, v1
+; GFX1250-NEXT: v_dual_lshrrev_b32 v6, 16, v1 :: v_dual_lshrrev_b32 v7, 24, v1
+; GFX1250-NEXT: v_dual_lshrrev_b32 v9, 8, v2 :: v_dual_lshrrev_b32 v10, 16, v2
+; GFX1250-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_lshrrev_b32 v11, 24, v2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v1, v13
+; GFX1250-NEXT: v_mov_b32_e32 v2, v12
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v12i8_align8:
; GFX9: ; %bb.0:
@@ -632,16 +823,35 @@ define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
}
define <3 x i32> @v_load_constant_v3i32_align16(ptr addrspace(4) %ptr) {
-; GFX12-LABEL: v_load_constant_v3i32_align16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align16:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align16:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_constant_v3i32_align16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v3i32_align16:
; GFX9: ; %bb.0:
@@ -720,6 +930,53 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
; GFX12-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s5
; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
;
+; GFX1250-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
+; GFX1250-UNALIGNED: ; %bb.0:
+; GFX1250-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1]
+; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
+; GFX1250-UNALIGNED-NEXT: ; return to shader part epilog
+;
+; GFX1250-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1:
+; GFX1250-NOUNALIGNED: ; %bb.0:
+; GFX1250-NOUNALIGNED-NEXT: s_clause 0xa
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s2, s[0:1], 0x1
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s3, s[0:1], 0x3
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s4, s[0:1], 0x2
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s5, s[0:1], 0x5
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s6, s[0:1], 0x7
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s7, s[0:1], 0x6
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s8, s[0:1], 0x9
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s9, s[0:1], 0xb
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s10, s[0:1], 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s11, s[0:1], 0x4
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s12, s[0:1], 0xa
+; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s1, s[0:1], 0x8
+; GFX1250-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s0, s2, 8
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s3, 24
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s3, s4, 16
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s4, s5, 8
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s3
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s5, s6, 24
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s6, s7, 16
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s7, s8, 8
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s0, s10
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s8, s9, 24
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s2, s0
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s12, 16
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s3, s4, s11
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s4, s5, s6
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s5, s7, s1
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s8, s2
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s1, s4, s3
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s5
+; GFX1250-NOUNALIGNED-NEXT: ; return to shader part epilog
+;
; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
@@ -916,6 +1173,34 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
; GFX12-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s7
; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
;
+; GFX1250-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
+; GFX1250-UNALIGNED: ; %bb.0:
+; GFX1250-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1]
+; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
+; GFX1250-UNALIGNED-NEXT: ; return to shader part epilog
+;
+; GFX1250-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2:
+; GFX1250-NOUNALIGNED: ; %bb.0:
+; GFX1250-NOUNALIGNED-NEXT: s_clause 0x5
+; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s2, s[0:1], 0x2
+; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s3, s[0:1], 0x6
+; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s4, s[0:1], 0xa
+; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s5, s[0:1], 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s6, s[0:1], 0x4
+; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s7, s[0:1], 0x8
+; GFX1250-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s0, s2, 16
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s1, s3, 16
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s4, 16
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s0, s5
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s1, s1, s6
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s7
+; GFX1250-NOUNALIGNED-NEXT: ; return to shader part epilog
+;
; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 3daae989..637aaf7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -164,7 +164,7 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_mul_i16_zeroext:
@@ -2854,89 +2854,90 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v14, 0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v0, v12, 0
+; GFX1250-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10
; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v1, v13, v[16:17]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[18:19]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v14, 0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v12, 0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v17, v13, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v2, v12, v[16:17]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v2, v12, v[0:1]
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v0, v10, 0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v3, v11, v[16:17]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v16, v10, 0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v4, v10, v[16:17]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v4, v10, v[0:1]
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v5, v9, v[16:17]
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[16:17]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v9, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[0:1]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9
; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[16:17]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v0, v13, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[0:1]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v12, v[20:21]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v12, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18
; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v0, v11, v[20:21]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v16, v11, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2
-; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[16:17]
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v8, 0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v1, v10, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[0:1]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v8, 0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v17, v10, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13
; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
-; GFX1250-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_mov_b32 v19, v24
+; GFX1250-NEXT: v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v24
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v0, v9, v[18:19]
-; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v15
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v21, s2
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v16, v9, v[18:19]
+; GFX1250-NEXT: v_mul_lo_u32 v2, v16, v15
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
-; GFX1250-NEXT: v_mul_lo_u32 v9, v1, v14
+; GFX1250-NEXT: v_mul_lo_u32 v9, v17, v14
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2
-; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[18:19]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s2
+; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[18:19]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v2, v10, s2
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v1, v10, s2
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26, v11, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v23, v0, s2
-; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v1, v14
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v9, s5
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v23, v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, v15
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v20, s4
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v29, s3
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v20, s4
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v25, s1
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v27, s0
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v25, s1
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v27, s0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo
-; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v0
-; GFX1250-NEXT: v_mov_b32_e32 v0, v16
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v22, vcc_lo
+; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v1
+; GFX1250-NEXT: v_mov_b32_e32 v1, v14
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i256 %num, %den
ret i256 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-zextload-from-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-zextload-from-and.mir
index 73e06de..8d72d11 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-zextload-from-and.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-zextload-from-and.mir
@@ -14,9 +14,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p1) :: (load (s64), addrspace 1)
- ; CHECK-NEXT: %k:_(s64) = G_CONSTANT i64 4294967295
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[LOAD]], %k
- ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[AND]](s64)
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[LOAD]](s64)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s32)
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[ZEXT]](s64)
%0:_(p1) = COPY $vgpr0_vgpr1
%1:_(s64) = G_LOAD %0 :: (load (s64), align 8, addrspace 1)
%k:_(s64) = G_CONSTANT i64 4294967295
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir
new file mode 100644
index 0000000..75f0061
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-ignore-copies-crash.mir
@@ -0,0 +1,77 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1200 -run-pass=amdgpu-regbank-combiner %s -o - | FileCheck %s
+
+# COM: Check that the pass doesn't crash.
+
+---
+name: test_inline_asm
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+ mode:
+ ieee: true
+ dx10-clamp: true
+body: |
+ bb.1 :
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: test_inline_asm
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+ ; CHECK-NEXT: INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %5(s32)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
+ ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[FMUL]], %5, [[COPY2]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_FMED3_]](s32)
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+ %2:vgpr(s32) = COPY %1(s32)
+ %3:vgpr(s32) = G_FMUL %0, %2
+ %4:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+ INLINEASM &"v_mov_b32 $0, 0", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %5:vgpr_32
+ %6:vgpr(s32) = COPY %4(s32)
+ %7:vgpr(s32) = nnan G_AMDGPU_FMED3 %3(s32), %5(s32), %6(s32)
+ $vgpr0 = COPY %7(s32)
+...
+
+---
+name: test_unmerge_values
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+ mode:
+ ieee: true
+ dx10-clamp: true
+body: |
+ bb.1 :
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: test_unmerge_values
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
+ ; CHECK-NEXT: [[AMDGPU_FMED3_:%[0-9]+]]:vgpr(s32) = nnan G_AMDGPU_FMED3 [[FMUL]], [[C2]], [[COPY2]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[C2]](s32)
+ %0:vgpr(s32) = COPY $vgpr0
+ %1:sgpr(s32) = G_FCONSTANT float 2.000000e+00
+ %2:vgpr(s32) = COPY %1(s32)
+ %3:vgpr(s32) = G_FMUL %0, %2
+ %4:sgpr(s32) = G_FCONSTANT float 1.000000e+00
+ %5:vgpr(s64) = G_CONSTANT i64 123456789
+ %6:vgpr(s32), %7:vgpr(s32) = G_UNMERGE_VALUES %5(s64)
+ %8:vgpr(s32) = COPY %4(s32)
+ %9:vgpr(s32) = nnan G_AMDGPU_FMED3 %3(s32), %7(s32), %8(s32)
+ $vgpr0 = COPY %7(s32)
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.s32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.s32.mir
index 45332c2..4fdc8e4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.s32.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.s32.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s
---
name: add_s32_ss
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readfirstlane.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readfirstlane.mir
index 04cdf2e..17fc2d0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readfirstlane.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.readfirstlane.mir
@@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -o - %s | FileCheck %s
---
name: readfirstlane_s
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
index 89681e7..c82f7c5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
@@ -2,6 +2,7 @@
; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck %s -check-prefix=GFX7
; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -simplify-mir -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck %s -check-prefix=GFX7
; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -simplify-mir -stop-after=regbankselect -o - %s | FileCheck %s -check-prefix=GFX12
+; RUN: llc -amdgpu-global-isel-new-legality -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -simplify-mir -stop-after=regbankselect -o - %s | FileCheck %s -check-prefix=GFX12
; Natural mapping
define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.getpc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.getpc.mir
index 9650da8..cba0db7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.getpc.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.getpc.mir
@@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -o - %s | FileCheck %s
---
name: getpc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bitcast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bitcast.mir
index 550f042..66bdf41 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bitcast.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-bitcast.mir
@@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s
---
name: bitcast_s
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-brcond.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-brcond.mir
index 3b2b141..cd957c8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-brcond.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-brcond.mir
@@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s
---
name: brcond_vcc_cond
@@ -40,9 +39,9 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
- ; CHECK-NEXT: G_BRCOND [[ZEXT]](s32), %bb.1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ICMP]], [[C]]
+ ; CHECK-NEXT: G_BRCOND [[AND]](s32), %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
bb.0.entry:
@@ -66,9 +65,9 @@ body: |
; CHECK-NEXT: liveins: $sgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32)
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
- ; CHECK-NEXT: G_BRCOND [[ZEXT]](s32), %bb.1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[COPY]], [[C]]
+ ; CHECK-NEXT: G_BRCOND [[AND]](s32), %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
bb.0.entry:
@@ -91,9 +90,11 @@ body: |
; CHECK-NEXT: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32)
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1)
- ; CHECK-NEXT: G_BRCOND [[COPY1]](s1), %bb.1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY]], [[C]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[AND]](s32), [[C1]]
+ ; CHECK-NEXT: G_BRCOND [[ICMP]](s1), %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
bb.0.entry:
@@ -120,13 +121,15 @@ body: |
; CHECK-NEXT: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY]], [[C]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[AND]](s32), [[C1]]
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1)
- ; CHECK-NEXT: G_BRCOND [[COPY1]](s1), %bb.1
+ ; CHECK-NEXT: G_BRCOND [[ICMP]](s1), %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
bb.0.entry:
@@ -157,9 +160,11 @@ body: |
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32)
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1)
- ; CHECK-NEXT: G_BRCOND [[COPY1]](s1), %bb.1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY]], [[C]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[AND]](s32), [[C1]]
+ ; CHECK-NEXT: G_BRCOND [[ICMP]](s1), %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
bb.0.entry:
@@ -189,10 +194,12 @@ body: |
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY]], [[C]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[AND]](s32), [[C1]]
; CHECK-NEXT: S_NOP 0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1)
- ; CHECK-NEXT: G_BRCOND [[COPY1]](s1), %bb.1
+ ; CHECK-NEXT: G_BRCOND [[ICMP]](s1), %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
bb.0.entry:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-build-vector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-build-vector.mir
index 0dc1165..ef2477c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-build-vector.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-build-vector.mir
@@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" -o - %s | FileCheck %s
---
name: build_vector_v2s32_ss
@@ -88,9 +87,9 @@ body: |
; CHECK-LABEL: name: build_vector_v2s32_aa
; CHECK: liveins: $agpr0, $agpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:agpr(s32) = COPY $agpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:agpr(s32) = COPY $agpr1
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:agpr(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $agpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $agpr1
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<2 x s32>)
%0:_(s32) = COPY $agpr0
%1:_(s32) = COPY $agpr1
@@ -111,9 +110,8 @@ body: |
; CHECK: liveins: $vgpr0, $agpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:agpr(s32) = COPY $agpr0
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY2]](s32)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $agpr0
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<2 x s32>)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $agpr0
@@ -133,10 +131,9 @@ body: |
; CHECK-LABEL: name: build_vector_v2s32_av
; CHECK: liveins: $vgpr0, $agpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:agpr(s32) = COPY $agpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $agpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<2 x s32>)
%0:_(s32) = COPY $agpr0
%1:_(s32) = COPY $vgpr0
@@ -157,10 +154,9 @@ body: |
; CHECK: liveins: $sgpr0, $agpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:agpr(s32) = COPY $agpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $agpr0
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY1]](s32)
; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<2 x s32>)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $agpr0
@@ -180,11 +176,10 @@ body: |
; CHECK-LABEL: name: build_vector_v2s32_as
; CHECK: liveins: $sgpr0, $agpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:agpr(s32) = COPY $agpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $agpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY2]](s32)
; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<2 x s32>)
%0:_(s32) = COPY $agpr0
%1:_(s32) = COPY $sgpr0
@@ -204,10 +199,10 @@ body: |
; CHECK-LABEL: name: build_vector_v3s32_aaa
; CHECK: liveins: $agpr0, $agpr1, $agpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:agpr(s32) = COPY $agpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:agpr(s32) = COPY $agpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:agpr(s32) = COPY $agpr2
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:agpr(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $agpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $agpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $agpr2
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<3 x s32>)
%0:_(s32) = COPY $agpr0
%1:_(s32) = COPY $agpr1
@@ -228,11 +223,11 @@ body: |
; CHECK-LABEL: name: build_vector_v4s32_aaaa
; CHECK: liveins: $agpr0, $agpr1, $agpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:agpr(s32) = COPY $agpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:agpr(s32) = COPY $agpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:agpr(s32) = COPY $agpr2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:agpr(s32) = COPY $agpr2
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:agpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $agpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $agpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $agpr2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $agpr2
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<4 x s32>)
%0:_(s32) = COPY $agpr0
%1:_(s32) = COPY $agpr1
@@ -254,15 +249,15 @@ body: |
; CHECK-LABEL: name: build_vector_v8s32_aaaaaaaa
; CHECK: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:agpr(s32) = COPY $agpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:agpr(s32) = COPY $agpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:agpr(s32) = COPY $agpr2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:agpr(s32) = COPY $agpr3
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:agpr(s32) = COPY $agpr4
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:agpr(s32) = COPY $agpr5
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:agpr(s32) = COPY $agpr6
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:agpr(s32) = COPY $agpr7
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:agpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $agpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $agpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $agpr2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $agpr3
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $agpr4
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $agpr5
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $agpr6
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $agpr7
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<8 x s32>)
%0:_(s32) = COPY $agpr0
%1:_(s32) = COPY $agpr1
@@ -288,23 +283,23 @@ body: |
; CHECK-LABEL: name: build_vector_v16s32_aaaaaaaaaaaaaaaa
; CHECK: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:agpr(s32) = COPY $agpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:agpr(s32) = COPY $agpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:agpr(s32) = COPY $agpr2
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:agpr(s32) = COPY $agpr3
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:agpr(s32) = COPY $agpr4
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:agpr(s32) = COPY $agpr5
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:agpr(s32) = COPY $agpr6
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:agpr(s32) = COPY $agpr7
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:agpr(s32) = COPY $agpr8
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:agpr(s32) = COPY $agpr9
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:agpr(s32) = COPY $agpr10
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:agpr(s32) = COPY $agpr11
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:agpr(s32) = COPY $agpr12
- ; CHECK-NEXT: [[COPY13:%[0-9]+]]:agpr(s32) = COPY $agpr13
- ; CHECK-NEXT: [[COPY14:%[0-9]+]]:agpr(s32) = COPY $agpr14
- ; CHECK-NEXT: [[COPY15:%[0-9]+]]:agpr(s32) = COPY $agpr15
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:agpr(<16 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32)
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $agpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $agpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $agpr2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $agpr3
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $agpr4
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $agpr5
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $agpr6
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $agpr7
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $agpr8
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY $agpr9
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY $agpr10
+ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY $agpr11
+ ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $agpr12
+ ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY $agpr13
+ ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr(s32) = COPY $agpr14
+ ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vgpr(s32) = COPY $agpr15
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<16 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32)
; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<16 x s32>)
%0:_(s32) = COPY $agpr0
%1:_(s32) = COPY $agpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-default.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-default.mir
index bd69995..456e0c13 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-default.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-default.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -march amdgcn -mcpu=fiji -run-pass=regbankselect %s -o - | FileCheck %s
+# RUN: llc -O0 -march amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s
# Check the default mappings for various instructions.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fadd.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fadd.mir
index 4fba303..8944fd7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fadd.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fadd.mir
@@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s
---
name: fadd_ss
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir
index 2b60dcd..62bf14b0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir
@@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s
---
name: test_frame_index_p5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
index 1b64099..e448c4c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
@@ -1,6 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=GCN,GFX7
# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=GCN,GFX12
+# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s -check-prefixes=GCN,GFX12
--- |
define amdgpu_kernel void @load_global_v8i32_non_uniform(ptr addrspace(1) %in) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptr-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptr-add.mir
index 55048d5..57b7a82 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptr-add.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptr-add.mir
@@ -1,6 +1,102 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
-# RUN: llc -mtriple=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+# RUN: llc -mtriple=amdgcn -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s
+
+---
+name: gep_p0_s_k
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+
+ ; CHECK-LABEL: name: gep_p0_s_k
+ ; CHECK: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p0) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+ %0:_(p0) = COPY $sgpr0_sgpr1
+ %1:_(s64) = G_CONSTANT i64 1
+ %2:_(p0) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p0_s_s
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+
+ ; CHECK-LABEL: name: gep_p0_s_s
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p0) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr2_sgpr3
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p0) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+ %0:_(p0) = COPY $sgpr0_sgpr1
+ %1:_(s64) = COPY $sgpr2_sgpr3
+ %2:_(p0) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p0_v_k
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+
+ ; CHECK-LABEL: name: gep_p0_v_k
+ ; CHECK: liveins: $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY [[C]](s64)
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p0) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+ %0:_(p0) = COPY $vgpr0_vgpr1
+ %1:_(s64) = G_CONSTANT i64 1
+ %2:_(p0) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p0_v_s
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr0_sgpr1
+
+ ; CHECK-LABEL: name: gep_p0_v_s
+ ; CHECK: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s64) = COPY [[COPY1]](s64)
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p0) = G_PTR_ADD [[COPY]], [[COPY2]](s64)
+ %0:_(p0) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $sgpr0_sgpr1
+ %2:_(p0) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p0_v_v
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: gep_p0_v_v
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p0) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+ %0:_(p0) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $vgpr2_vgpr3
+ %2:_(p0) = G_PTR_ADD %0, %1
+...
---
name: gep_p1_s_k
@@ -98,3 +194,294 @@ body: |
%1:_(s64) = COPY $vgpr2_vgpr3
%2:_(p1) = G_PTR_ADD %0, %1
...
+
+---
+name: gep_p3_s_k
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: gep_p3_s_k
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+ %0:_(p3) = COPY $sgpr0
+ %1:_(s32) = G_CONSTANT i32 1
+ %2:_(p3) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p3_s_s
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: gep_p3_s_s
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p3) = G_PTR_ADD [[COPY]], [[COPY1]](s32)
+ %0:_(p3) = COPY $sgpr0
+ %1:_(s32) = COPY $sgpr1
+ %2:_(p3) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p3_v_k
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: gep_p3_v_k
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[COPY1]](s32)
+ %0:_(p3) = COPY $vgpr0
+ %1:_(s32) = G_CONSTANT i32 1
+ %2:_(p3) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p3_v_s
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr0
+
+ ; CHECK-LABEL: name: gep_p3_v_s
+ ; CHECK: liveins: $vgpr0, $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[COPY2]](s32)
+ %0:_(p3) = COPY $vgpr0
+ %1:_(s32) = COPY $sgpr0
+ %2:_(p3) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p3_v_v
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: gep_p3_v_v
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[COPY1]](s32)
+ %0:_(p3) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p3) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p4_s_k
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+
+ ; CHECK-LABEL: name: gep_p4_s_k
+ ; CHECK: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+ %0:_(p4) = COPY $sgpr0_sgpr1
+ %1:_(s64) = G_CONSTANT i64 1
+ %2:_(p4) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p4_s_s
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+
+ ; CHECK-LABEL: name: gep_p4_s_s
+ ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr2_sgpr3
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p4) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+ %0:_(p4) = COPY $sgpr0_sgpr1
+ %1:_(s64) = COPY $sgpr2_sgpr3
+ %2:_(p4) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p4_v_k
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+
+ ; CHECK-LABEL: name: gep_p4_v_k
+ ; CHECK: liveins: $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY [[C]](s64)
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+ %0:_(p4) = COPY $vgpr0_vgpr1
+ %1:_(s64) = G_CONSTANT i64 1
+ %2:_(p4) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p4_v_s
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr0_sgpr1
+
+ ; CHECK-LABEL: name: gep_p4_v_s
+ ; CHECK: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s64) = COPY [[COPY1]](s64)
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[COPY2]](s64)
+ %0:_(p4) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $sgpr0_sgpr1
+ %2:_(p4) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p4_v_v
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: gep_p4_v_v
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p4) = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+ %0:_(p4) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $vgpr2_vgpr3
+ %2:_(p4) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p5_s_k
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: gep_p5_s_k
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sgpr0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+ %0:_(p5) = COPY $sgpr0
+ %1:_(s32) = G_CONSTANT i32 1
+ %2:_(p5) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p5_s_s
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: gep_p5_s_s
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[COPY1]](s32)
+ %0:_(p5) = COPY $sgpr0
+ %1:_(s32) = COPY $sgpr1
+ %2:_(p5) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p5_v_k
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: gep_p5_v_k
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p5) = COPY $vgpr0
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p5) = G_PTR_ADD [[COPY]], [[COPY1]](s32)
+ %0:_(p5) = COPY $vgpr0
+ %1:_(s32) = G_CONSTANT i32 1
+ %2:_(p5) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p5_v_s
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr0
+
+ ; CHECK-LABEL: name: gep_p5_v_s
+ ; CHECK: liveins: $vgpr0, $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p5) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p5) = G_PTR_ADD [[COPY]], [[COPY2]](s32)
+ %0:_(p5) = COPY $vgpr0
+ %1:_(s32) = COPY $sgpr0
+ %2:_(p5) = G_PTR_ADD %0, %1
+...
+
+---
+name: gep_p5_v_v
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: gep_p5_v_v
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(p5) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p5) = G_PTR_ADD [[COPY]], [[COPY1]](s32)
+ %0:_(p5) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p5) = G_PTR_ADD %0, %1
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir
index 2177cd7..b2ff0995c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir
@@ -1,6 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -run-pass=regbankselect %s -o - | FileCheck -check-prefix=GFX7 %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -run-pass=regbankselect %s -o - | FileCheck -check-prefix=GFX12 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck -check-prefix=GFX7 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck -check-prefix=GFX12 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck -check-prefix=GFX12 %s
--- |
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sub.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sub.mir
index 4fcd0fd..b0199d3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sub.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sub.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s
---
name: sub_s32_ss
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-uitofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-uitofp.mir
index 554c88a..e95be13 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-uitofp.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-uitofp.mir
@@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s
---
name: uitofp_s
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 4031fe0..f57fc00 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -194,10 +194,8 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-LABEL: s_sdiv_i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; CHECK-NEXT: s_mov_b32 s7, -1
-; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[0:1], 0
; CHECK-NEXT: s_mov_b32 s0, 1
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
@@ -218,7 +216,6 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
; CHECK-NEXT: s_subb_u32 s5, 0, s11
-; CHECK-NEXT: s_xor_b64 s[6:7], s[6:7], s[8:9]
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; CHECK-NEXT: v_trunc_f32_e32 v2, v1
@@ -327,9 +324,10 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, s6, v0
+; CHECK-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9]
+; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0
+; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0
; CHECK-NEXT: s_branch .LBB1_3
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll
index ad60a61..9f4a6f2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; Test gfx9+ s_shl[1-4]_add_u32 pattern matching
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 1a10f5f..8d8eca1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -188,12 +188,10 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-LABEL: s_srem_i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; CHECK-NEXT: s_mov_b32 s7, -1
-; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[0:1], 0
-; CHECK-NEXT: s_mov_b32 s7, 1
+; CHECK-NEXT: s_mov_b32 s0, 1
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: s_ashr_i32 s6, s3, 31
@@ -212,7 +210,6 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
; CHECK-NEXT: s_subb_u32 s5, 0, s9
-; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; CHECK-NEXT: v_trunc_f32_e32 v2, v1
@@ -273,43 +270,43 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
; CHECK-NEXT: v_mul_lo_u32 v2, s11, v0
; CHECK-NEXT: v_mul_lo_u32 v3, s10, v1
-; CHECK-NEXT: v_mul_hi_u32 v5, s10, v0
+; CHECK-NEXT: v_mul_hi_u32 v4, s10, v0
; CHECK-NEXT: v_mul_hi_u32 v0, s11, v0
-; CHECK-NEXT: v_mul_hi_u32 v6, s11, v1
+; CHECK-NEXT: v_mul_hi_u32 v5, s11, v1
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v5, s11, v1
+; CHECK-NEXT: v_mul_lo_u32 v4, s11, v1
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_mul_hi_u32 v3, s10, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v0, v2
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v2, v[1:2]
-; CHECK-NEXT: v_mov_b32_e32 v3, s11
+; CHECK-NEXT: v_mov_b32_e32 v5, s11
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s10, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2]
-; CHECK-NEXT: v_mov_b32_e32 v4, s9
-; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v3, v1, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
+; CHECK-NEXT: v_mov_b32_e32 v3, s9
+; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc
; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s11, v1
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0
+; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v5, s[0:1]
-; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s9, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v2, v4, v5, s[0:1]
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s8, v3
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
@@ -322,11 +319,12 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; CHECK-NEXT: v_xor_b32_e32 v0, s6, v0
; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0
+; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: s_branch .LBB1_3
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: .LBB1_3: ; %Flow
-; CHECK-NEXT: s_xor_b32 s0, s7, 1
+; CHECK-NEXT: s_xor_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
; CHECK-NEXT: ; %bb.4:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-weird-size.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-weird-size.ll
new file mode 100644
index 0000000..0aa08cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-weird-size.ll
@@ -0,0 +1,224 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -O0 -global-isel=true -stop-after=legalizer -o - %s | FileCheck -check-prefix=UNPACKED %s
+
+define void @store_i48(ptr addrspace(1) %ptr, i48 %arg) #0 {
+ ; UNPACKED-LABEL: name: store_i48
+ ; UNPACKED: bb.1 (%ir-block.0):
+ ; UNPACKED-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; UNPACKED-NEXT: {{ $}}
+ ; UNPACKED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; UNPACKED-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; UNPACKED-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY [[MV1]](s64)
+ ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+ ; UNPACKED-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY4]], [[C]](s32)
+ ; UNPACKED-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+ ; UNPACKED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[MV]], [[C1]](s64)
+ ; UNPACKED-NEXT: G_STORE [[COPY2]](s32), [[MV]](p1) :: (store (s32) into %ir.ptr, addrspace 1)
+ ; UNPACKED-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64)
+ ; UNPACKED-NEXT: G_STORE [[TRUNC]](s32), [[PTR_ADD]](p1) :: (store (s16) into %ir.ptr + 4, align 4, addrspace 1)
+ ; UNPACKED-NEXT: SI_RETURN
+ store i48 %arg, ptr addrspace(1) %ptr, align 4
+ ret void
+}
+
+define void @store_i55(ptr addrspace(1) %ptr, i55 %arg) #0 {
+ ; UNPACKED-LABEL: name: store_i55
+ ; UNPACKED: bb.1 (%ir-block.0):
+ ; UNPACKED-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; UNPACKED-NEXT: {{ $}}
+ ; UNPACKED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; UNPACKED-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; UNPACKED-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36028797018963967
+ ; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[MV1]], [[C]]
+ ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY [[AND]](s64)
+ ; UNPACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+ ; UNPACKED-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY4]], [[C1]](s32)
+ ; UNPACKED-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+ ; UNPACKED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[MV]], [[C2]](s64)
+ ; UNPACKED-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY4]](s64)
+ ; UNPACKED-NEXT: G_STORE [[TRUNC]](s32), [[MV]](p1) :: (store (s32) into %ir.ptr, addrspace 1)
+ ; UNPACKED-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64)
+ ; UNPACKED-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; UNPACKED-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC1]], [[C3]](s32)
+ ; UNPACKED-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+ ; UNPACKED-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[PTR_ADD]], [[C4]](s64)
+ ; UNPACKED-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD]](p1) :: (store (s16) into %ir.ptr + 4, align 4, addrspace 1)
+ ; UNPACKED-NEXT: G_STORE [[LSHR1]](s32), [[PTR_ADD1]](p1) :: (store (s8) into %ir.ptr + 6, align 2, basealign 4, addrspace 1)
+ ; UNPACKED-NEXT: SI_RETURN
+ store i55 %arg, ptr addrspace(1) %ptr, align 4
+ ret void
+}
+
+define void @store_i56(ptr addrspace(1) %ptr, i56 %arg) #0 {
+ ; UNPACKED-LABEL: name: store_i56
+ ; UNPACKED: bb.1 (%ir-block.0):
+ ; UNPACKED-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; UNPACKED-NEXT: {{ $}}
+ ; UNPACKED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; UNPACKED-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; UNPACKED-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY [[MV1]](s64)
+ ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+ ; UNPACKED-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY4]], [[C]](s32)
+ ; UNPACKED-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+ ; UNPACKED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[MV]], [[C1]](s64)
+ ; UNPACKED-NEXT: G_STORE [[COPY2]](s32), [[MV]](p1) :: (store (s32) into %ir.ptr, addrspace 1)
+ ; UNPACKED-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64)
+ ; UNPACKED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; UNPACKED-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC]], [[C2]](s32)
+ ; UNPACKED-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+ ; UNPACKED-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[PTR_ADD]], [[C3]](s64)
+ ; UNPACKED-NEXT: G_STORE [[TRUNC]](s32), [[PTR_ADD]](p1) :: (store (s16) into %ir.ptr + 4, align 4, addrspace 1)
+ ; UNPACKED-NEXT: G_STORE [[LSHR1]](s32), [[PTR_ADD1]](p1) :: (store (s8) into %ir.ptr + 6, align 2, basealign 4, addrspace 1)
+ ; UNPACKED-NEXT: SI_RETURN
+ store i56 %arg, ptr addrspace(1) %ptr, align 4
+ ret void
+}
+
+define void @store_i65(ptr addrspace(1) %ptr, i65 %arg) #0 {
+ ; UNPACKED-LABEL: name: store_i65
+ ; UNPACKED: bb.1 (%ir-block.0):
+ ; UNPACKED-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; UNPACKED-NEXT: {{ $}}
+ ; UNPACKED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; UNPACKED-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; UNPACKED-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; UNPACKED-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; UNPACKED-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[DEF]](s32)
+ ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; UNPACKED-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[MV1]], [[C]]
+ ; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[MV2]], [[C1]]
+ ; UNPACKED-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; UNPACKED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[MV]], [[C2]](s64)
+ ; UNPACKED-NEXT: G_STORE [[AND]](s64), [[MV]](p1) :: (store (s64) into %ir.ptr, align 4, addrspace 1)
+ ; UNPACKED-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64)
+ ; UNPACKED-NEXT: G_STORE [[TRUNC]](s32), [[PTR_ADD]](p1) :: (store (s8) into %ir.ptr + 8, align 4, addrspace 1)
+ ; UNPACKED-NEXT: SI_RETURN
+ store i65 %arg, ptr addrspace(1) %ptr, align 4
+ ret void
+}
+
+define void @store_i95(ptr addrspace(1) %ptr, i95 %arg) #0 {
+ ; UNPACKED-LABEL: name: store_i95
+ ; UNPACKED: bb.1 (%ir-block.0):
+ ; UNPACKED-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; UNPACKED-NEXT: {{ $}}
+ ; UNPACKED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; UNPACKED-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; UNPACKED-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; UNPACKED-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; UNPACKED-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[DEF]](s32)
+ ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; UNPACKED-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2147483647
+ ; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[MV1]], [[C]]
+ ; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[MV2]], [[C1]]
+ ; UNPACKED-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; UNPACKED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[MV]], [[C2]](s64)
+ ; UNPACKED-NEXT: G_STORE [[AND]](s64), [[MV]](p1) :: (store (s64) into %ir.ptr, align 4, addrspace 1)
+ ; UNPACKED-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64)
+ ; UNPACKED-NEXT: G_STORE [[TRUNC]](s32), [[PTR_ADD]](p1) :: (store (s32) into %ir.ptr + 8, addrspace 1)
+ ; UNPACKED-NEXT: SI_RETURN
+ store i95 %arg, ptr addrspace(1) %ptr, align 4
+ ret void
+}
+
+define void @store_i96(ptr addrspace(1) %ptr, i96 %arg) #0 {
+ ; UNPACKED-LABEL: name: store_i96
+ ; UNPACKED: bb.1 (%ir-block.0):
+ ; UNPACKED-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ ; UNPACKED-NEXT: {{ $}}
+ ; UNPACKED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; UNPACKED-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; UNPACKED-NEXT: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32)
+ ; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<3 x s32>) = G_BITCAST [[MV1]](s96)
+ ; UNPACKED-NEXT: G_STORE [[BITCAST]](<3 x s32>), [[MV]](p1) :: (store (<3 x s32>) into %ir.ptr, align 4, addrspace 1)
+ ; UNPACKED-NEXT: SI_RETURN
+ store i96 %arg, ptr addrspace(1) %ptr, align 4
+ ret void
+}
+
+define void @store_i97(ptr addrspace(1) %ptr, i97 %arg) #0 {
+ ; UNPACKED-LABEL: name: store_i97
+ ; UNPACKED: bb.1 (%ir-block.0):
+ ; UNPACKED-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; UNPACKED-NEXT: {{ $}}
+ ; UNPACKED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; UNPACKED-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; UNPACKED-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934591
+ ; UNPACKED-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; UNPACKED-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[MV1]], [[C]]
+ ; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[MV2]], [[C1]]
+ ; UNPACKED-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+ ; UNPACKED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[MV]], [[C2]](s64)
+ ; UNPACKED-NEXT: G_STORE [[AND]](s64), [[MV]](p1) :: (store (s64) into %ir.ptr, align 4, addrspace 1)
+ ; UNPACKED-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+ ; UNPACKED-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[AND1]], [[C3]](s32)
+ ; UNPACKED-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+ ; UNPACKED-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[PTR_ADD]], [[C4]](s64)
+ ; UNPACKED-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64)
+ ; UNPACKED-NEXT: G_STORE [[TRUNC]](s32), [[PTR_ADD]](p1) :: (store (s32) into %ir.ptr + 8, addrspace 1)
+ ; UNPACKED-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64)
+ ; UNPACKED-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD1]](p1) :: (store (s8) into %ir.ptr + 12, align 4, addrspace 1)
+ ; UNPACKED-NEXT: SI_RETURN
+ store i97 %arg, ptr addrspace(1) %ptr, align 4
+ ret void
+}
+
+define void @store_i127(ptr addrspace(1) %ptr, i127 %arg) #0 {
+ ; UNPACKED-LABEL: name: store_i127
+ ; UNPACKED: bb.1 (%ir-block.0):
+ ; UNPACKED-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; UNPACKED-NEXT: {{ $}}
+ ; UNPACKED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; UNPACKED-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+ ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+ ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; UNPACKED-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 9223372036854775807
+ ; UNPACKED-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; UNPACKED-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+ ; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[MV1]], [[C]]
+ ; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[MV2]], [[C1]]
+ ; UNPACKED-NEXT: [[MV3:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[AND]](s64), [[AND1]](s64)
+ ; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[MV3]](s128)
+ ; UNPACKED-NEXT: G_STORE [[BITCAST]](<4 x s32>), [[MV]](p1) :: (store (<4 x s32>) into %ir.ptr, align 4, addrspace 1)
+ ; UNPACKED-NEXT: SI_RETURN
+ store i127 %arg, ptr addrspace(1) %ptr, align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
index c199923..b6c8f21 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=bonaire < %s | FileCheck -check-prefix=GFX7 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=bonaire < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
define i16 @v_trunc_i32_to_i16(i32 %src) {
; GFX7-LABEL: v_trunc_i32_to_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 1a6d261..4de1078 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -187,11 +187,9 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-LABEL: s_udiv_i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_or_b64 s[4:5], s[0:1], s[2:3]
-; CHECK-NEXT: s_mov_b32 s6, 0
-; CHECK-NEXT: s_mov_b32 s7, -1
-; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
+; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[4:5], 0
-; CHECK-NEXT: s_mov_b32 s6, 1
+; CHECK-NEXT: s_mov_b32 s4, 1
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s2
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
@@ -199,7 +197,6 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cvt_f32_u32_e32 v1, s3
; CHECK-NEXT: s_sub_u32 s4, 0, s2
; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: v_madmk_f32 v1, v1, 0x4f800000, v2
; CHECK-NEXT: s_subb_u32 s5, 0, s3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1
@@ -318,11 +315,12 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: s_branch .LBB1_3
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: .LBB1_3: ; %Flow
-; CHECK-NEXT: s_xor_b32 s1, s6, 1
+; CHECK-NEXT: s_xor_b32 s1, s4, 1
; CHECK-NEXT: s_cmp_lg_u32 s1, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
; CHECK-NEXT: ; %bb.4:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 2a1bf4b..a41ec8e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -184,18 +184,15 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-LABEL: s_urem_i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_or_b64 s[4:5], s[0:1], s[2:3]
-; CHECK-NEXT: s_mov_b32 s6, 0
-; CHECK-NEXT: s_mov_b32 s7, -1
-; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
+; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[4:5], 0
-; CHECK-NEXT: s_mov_b32 s6, 1
+; CHECK-NEXT: s_mov_b32 s4, 1
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s2
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: v_mov_b32_e32 v0, s3
; CHECK-NEXT: v_cvt_f32_u32_e32 v1, s3
; CHECK-NEXT: s_sub_u32 s4, 0, s2
-; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
; CHECK-NEXT: v_madmk_f32 v1, v1, 0x4f800000, v2
; CHECK-NEXT: s_subb_u32 s5, 0, s3
@@ -314,11 +311,12 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: s_branch .LBB1_3
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: .LBB1_3: ; %Flow
-; CHECK-NEXT: s_xor_b32 s1, s6, 1
+; CHECK-NEXT: s_xor_b32 s1, s4, 1
; CHECK-NEXT: s_cmp_lg_u32 s1, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
; CHECK-NEXT: ; %bb.4:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/v_bfe_i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/v_bfe_i32.ll
index 5408ad0..abfb4fe 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/v_bfe_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/v_bfe_i32.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel < %s | FileCheck --check-prefix=PREGFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -mcpu=hawaii < %s | FileCheck --check-prefix=PREGFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -mcpu=fiji < %s | FileCheck --check-prefix=PREGFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -mcpu=gfx90a < %s | FileCheck --check-prefix=PREGFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -mcpu=gfx1030 < %s | FileCheck --check-prefix=GFX10PLUS %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX10PLUS %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -new-reg-bank-select < %s | FileCheck --check-prefix=PREGFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -new-reg-bank-select -mcpu=hawaii < %s | FileCheck --check-prefix=PREGFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -new-reg-bank-select -mcpu=fiji < %s | FileCheck --check-prefix=PREGFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -new-reg-bank-select -mcpu=gfx90a < %s | FileCheck --check-prefix=PREGFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -new-reg-bank-select -mcpu=gfx1030 < %s | FileCheck --check-prefix=GFX10PLUS %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -new-reg-bank-select -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX10PLUS %s
define i32 @check_v_bfe(i16 %a) {
; PREGFX9-LABEL: check_v_bfe:
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-ds-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-ds-atomic-cmpxchg.ll
new file mode 100644
index 0000000..e8f949d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/a-v-ds-atomic-cmpxchg.ll
@@ -0,0 +1,237 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s
+
+define void @ds_atomic_cmpxchg_i32_ret_av_av__av(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_cmpxchg_i32_ret_av_av__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2 offset:40
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data0 = call i32 asm "; def $0", "=^VA"()
+ %data1 = call i32 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr addrspace(3) %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+define void @ds_atomic_cmpxchg_i32_ret_av_av__v(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_cmpxchg_i32_ret_av_av__v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2 offset:40
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data0 = call i32 asm "; def $0", "=^VA"()
+ %data1 = call i32 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr addrspace(3) %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ call void asm "; use $0", "v"(i32 %result)
+ ret void
+}
+
+define void @ds_atomic_cmpxchg_i32_ret_av_av__a(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_cmpxchg_i32_ret_av_av__a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2 offset:40
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data0 = call i32 asm "; def $0", "=^VA"()
+ %data1 = call i32 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr addrspace(3) %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ call void asm "; use $0", "a"(i32 %result)
+ ret void
+}
+
+; FIXME: Broken
+; define void @ds_atomic_cmpxchg_i32_ret_a_a__a(ptr addrspace(3) %ptr) #0 {
+; %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+; %data0 = call i32 asm "; def $0", "=a"()
+; %data1 = call i32 asm "; def $0", "=a"()
+; %pair = cmpxchg ptr addrspace(3) %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+; %result = extractvalue { i32, i1 } %pair, 0
+; call void asm "; use $0", "a"(i32 %result)
+; ret void
+; }
+
+; FIXME: Broken
+; define void @ds_atomic_cmpxchg_i32_ret_a_a__v(ptr addrspace(3) %ptr) #0 {
+; %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+; %data0 = call i32 asm "; def $0", "=a"()
+; %data1 = call i32 asm "; def $0", "=a"()
+; %pair = cmpxchg ptr addrspace(3) %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+; %result = extractvalue { i32, i1 } %pair, 0
+; call void asm "; use $0", "v"(i32 %result)
+; ret void
+; }
+
+; FIXME: Broken
+; define void @ds_atomic_cmpxchg_i32_ret_v_a__v(ptr addrspace(3) %ptr) #0 {
+; %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+; %data0 = call i32 asm "; def $0", "=v"()
+; %data1 = call i32 asm "; def $0", "=a"()
+; %pair = cmpxchg ptr addrspace(3) %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+; %result = extractvalue { i32, i1 } %pair, 0
+; call void asm "; use $0", "v"(i32 %result)
+; ret void
+; }
+
+; FIXME: Broken
+; define void @ds_atomic_cmpxchg_i32_ret_a_v__v(ptr addrspace(3) %ptr) #0 {
+; %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+; %data0 = call i32 asm "; def $0", "=a"()
+; %data1 = call i32 asm "; def $0", "=v"()
+; %pair = cmpxchg ptr addrspace(3) %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+; %result = extractvalue { i32, i1 } %pair, 0
+; call void asm "; use $0", "v"(i32 %result)
+; ret void
+; }
+
+define void @ds_atomic_cmpxchg_i32_ret_v_v__a(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_cmpxchg_i32_ret_v_v__a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2 offset:40
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data0 = call i32 asm "; def $0", "=v"()
+ %data1 = call i32 asm "; def $0", "=v"()
+ %pair = cmpxchg ptr addrspace(3) %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ call void asm "; use $0", "a"(i32 %result)
+ ret void
+}
+
+define void @ds_atomic_cmpxchg_i32_ret_av_v__av(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_cmpxchg_i32_ret_av_v__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2 offset:40
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data0 = call i32 asm "; def $0", "=^VA"()
+ %data1 = call i32 asm "; def $0", "=v"()
+ %pair = cmpxchg ptr addrspace(3) %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+define void @ds_atomic_cmpxchg_i32_ret_v_av__av(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_cmpxchg_i32_ret_v_av__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2 offset:40
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data0 = call i32 asm "; def $0", "=v"()
+ %data1 = call i32 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr addrspace(3) %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+; FIXME: Broken
+; define void @ds_atomic_cmpxchg_i32_ret_av_a__av(ptr addrspace(3) %ptr) #0 {
+; %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+; %data0 = call i32 asm "; def $0", "=^VA"()
+; %data1 = call i32 asm "; def $0", "=a"()
+; %pair = cmpxchg ptr addrspace(3) %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+; %result = extractvalue { i32, i1 } %pair, 0
+; call void asm "; use $0", "^VA"(i32 %result)
+; ret void
+; }
+
+define void @ds_atomic_cmpxchg_i32_ret_a_av__av(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_cmpxchg_i32_ret_a_av__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_cmpst_rtn_b32 v0, v0, v2, v1 offset:40
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data0 = call i32 asm "; def $0", "=a"()
+ %data1 = call i32 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr addrspace(3) %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" }
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll
new file mode 100644
index 0000000..4c62409
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/a-v-ds-atomicrmw.ll
@@ -0,0 +1,1123 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s
+
+;---------------------------------------------------------------------
+; xchg i32 cases
+;---------------------------------------------------------------------
+
+; Input and result use AGPR
+define void @ds_atomic_xchg_i32_ret_a_a(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i32_ret_a_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a0
+; CHECK-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=a"()
+ %result = atomicrmw xchg ptr addrspace(3) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "a"(i32 %result)
+ ret void
+}
+
+; Input is AGPR, result used as VGPR
+define void @ds_atomic_xchg_i32_ret_a_v(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i32_ret_a_v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a0
+; CHECK-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=a"()
+ %result = atomicrmw xchg ptr addrspace(3) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "v"(i32 %result)
+ ret void
+}
+
+; Input is VGPR, result used as AGPR
+define void @ds_atomic_xchg_i32_ret_v_a(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i32_ret_v_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=v"()
+ %result = atomicrmw xchg ptr addrspace(3) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "a"(i32 %result)
+ ret void
+}
+
+; Input is AV, result also used as AV
+define void @ds_atomic_xchg_i32_ret_av_av(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i32_ret_av_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %result = atomicrmw xchg ptr addrspace(3) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+; Input is AV, result used as VGPR
+define void @ds_atomic_xchg_i32_ret_av_v(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i32_ret_av_v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %result = atomicrmw xchg ptr addrspace(3) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "v"(i32 %result)
+ ret void
+}
+
+; Input is AV, result used as AGPR
+define void @ds_atomic_xchg_i32_ret_av_a(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i32_ret_av_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %result = atomicrmw xchg ptr addrspace(3) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "a"(i32 %result)
+ ret void
+}
+
+; Input is AGPR, result used as AV
+define void @ds_atomic_xchg_i32_ret_a_av(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i32_ret_a_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a0
+; CHECK-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=a"()
+ %result = atomicrmw xchg ptr addrspace(3) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+; Input is VGPR, result used as AV
+define void @ds_atomic_xchg_i32_ret_v_av(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i32_ret_v_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=v"()
+ %result = atomicrmw xchg ptr addrspace(3) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+define void @ds_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i32_ret_av_av_no_agprs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v40 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a3, v41 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a4, v42 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a5, v43 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a6, v44 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a7, v45 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a8, v46 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a9, v47 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a10, v56 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a11, v57 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a12, v58 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a13, v59 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a14, v60 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a15, v61 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a16, v62 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a17, v63 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_write_b32 a18, v31 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
+; CHECK-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a31, v18 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a30, v19 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a29, v20 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a28, v21 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a27, v22 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a26, v23 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a25, v24 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a24, v25 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a23, v26 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a22, v27 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a21, v28 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a20, v29 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a19, v30 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; CHECK-NEXT: v_accvgpr_read_b32 v18, a31 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v19, a30 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v20, a29 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v21, a28 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v22, a27 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v23, a26 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v24, a25 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v25, a24 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v26, a23 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v27, a22 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v28, a21 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v29, a20 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v30, a19 ; Reload Reuse
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_accvgpr_read_b32 v31, a18 ; Reload Reuse
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v63, a17 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v62, a16 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v61, a15 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v60, a14 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v59, a13 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v58, a12 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v57, a11 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v56, a10 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v47, a9 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v46, a8 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v45, a7 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v44, a6 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v43, a5 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v42, a4 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v41, a3 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v40, a2 ; Reload Reuse
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"()
+ %vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0
+ %vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1
+ %result = atomicrmw xchg ptr addrspace(3) %ptr, i32 %data seq_cst
+ call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1)
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+define void @ds_atomic_xchg_i32_noret_a(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i32_noret_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a0
+; CHECK-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=a"()
+ %unused = atomicrmw xchg ptr addrspace(3) %ptr, i32 %data seq_cst
+ ret void
+}
+
+define void @ds_atomic_xchg_i32_noret_av(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i32_noret_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %unused = atomicrmw xchg ptr addrspace(3) %ptr, i32 %data seq_cst
+ ret void
+}
+
+;---------------------------------------------------------------------
+; xchg i64 cases
+;---------------------------------------------------------------------
+
+; Input and result use AGPR
+define void @ds_atomic_xchg_i64_ret_a_a(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i64_ret_a_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: ds_wrxchg_rtn_b64 v[0:1], v0, v[2:3]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i64 asm "; def $0", "=a"()
+ %result = atomicrmw xchg ptr addrspace(3) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+; Input is AGPR, result used as VGPR
+define void @ds_atomic_xchg_i64_ret_a_v(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i64_ret_a_v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: ds_wrxchg_rtn_b64 v[0:1], v0, v[2:3]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i64 asm "; def $0", "=a"()
+ %result = atomicrmw xchg ptr addrspace(3) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "v"(i64 %result)
+ ret void
+}
+
+; Input is VGPR, result used as AGPR
+define void @ds_atomic_xchg_i64_ret_v_a(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i64_ret_v_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_wrxchg_rtn_b64 v[0:1], v0, v[2:3]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i64 asm "; def $0", "=v"()
+ %result = atomicrmw xchg ptr addrspace(3) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+; Input is AV, result also used as AV
+define void @ds_atomic_xchg_i64_ret_av_av(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i64_ret_av_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_wrxchg_rtn_b64 v[0:1], v0, v[2:3]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i64 asm "; def $0", "=^VA"()
+ %result = atomicrmw xchg ptr addrspace(3) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+; Input is AV, result used as VGPR
+define void @ds_atomic_xchg_i64_ret_av_v(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i64_ret_av_v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_wrxchg_rtn_b64 v[0:1], v0, v[2:3]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i64 asm "; def $0", "=^VA"()
+ %result = atomicrmw xchg ptr addrspace(3) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "v"(i64 %result)
+ ret void
+}
+
+; Input is AV, result used as AGPR
+define void @ds_atomic_xchg_i64_ret_av_a(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i64_ret_av_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_wrxchg_rtn_b64 v[0:1], v0, v[2:3]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i64 asm "; def $0", "=^VA"()
+ %result = atomicrmw xchg ptr addrspace(3) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+; Input is AGPR, result used as AV
+define void @ds_atomic_xchg_i64_ret_a_av(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i64_ret_a_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: ds_wrxchg_rtn_b64 v[0:1], v0, v[2:3]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i64 asm "; def $0", "=a"()
+ %result = atomicrmw xchg ptr addrspace(3) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+; Input is VGPR, result used as AV
+define void @ds_atomic_xchg_i64_ret_v_av(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i64_ret_v_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_wrxchg_rtn_b64 v[0:1], v0, v[2:3]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i64 asm "; def $0", "=v"()
+ %result = atomicrmw xchg ptr addrspace(3) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+define void @ds_atomic_xchg_i64_noret_a(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i64_noret_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: ds_wrxchg_rtn_b64 v[0:1], v0, v[2:3]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=a"()
+ %unused = atomicrmw xchg ptr addrspace(3) %ptr, i64 %data seq_cst
+ ret void
+}
+
+define void @ds_atomic_xchg_i64_noret_av(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xchg_i64_noret_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_wrxchg_rtn_b64 v[0:1], v0, v[2:3]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=^VA"()
+ %unused = atomicrmw xchg ptr addrspace(3) %ptr, i64 %data seq_cst
+ ret void
+}
+
+;---------------------------------------------------------------------
+; xor i32 cases
+;---------------------------------------------------------------------
+
+; Input and result use AGPR
+define void @ds_atomic_xor_i32_ret_a_a(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i32_ret_a_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a0
+; CHECK-NEXT: ds_xor_rtn_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=a"()
+ %result = atomicrmw xor ptr addrspace(3) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "a"(i32 %result)
+ ret void
+}
+
+; Input is AGPR, result used as VGPR
+define void @ds_atomic_xor_i32_ret_a_v(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i32_ret_a_v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a0
+; CHECK-NEXT: ds_xor_rtn_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=a"()
+ %result = atomicrmw xor ptr addrspace(3) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "v"(i32 %result)
+ ret void
+}
+
+; Input is VGPR, result used as AGPR
+define void @ds_atomic_xor_i32_ret_v_a(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i32_ret_v_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_xor_rtn_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=v"()
+ %result = atomicrmw xor ptr addrspace(3) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "a"(i32 %result)
+ ret void
+}
+
+; Input is AV, result also used as AV
+define void @ds_atomic_xor_i32_ret_av_av(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i32_ret_av_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_xor_rtn_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %result = atomicrmw xor ptr addrspace(3) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+; Input is AV, result used as VGPR
+define void @ds_atomic_xor_i32_ret_av_v(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i32_ret_av_v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_xor_rtn_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %result = atomicrmw xor ptr addrspace(3) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "v"(i32 %result)
+ ret void
+}
+
+; Input is AV, result used as AGPR
+define void @ds_atomic_xor_i32_ret_av_a(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i32_ret_av_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_xor_rtn_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %result = atomicrmw xor ptr addrspace(3) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "a"(i32 %result)
+ ret void
+}
+
+; Input is AGPR, result used as AV
+define void @ds_atomic_xor_i32_ret_a_av(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i32_ret_a_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a0
+; CHECK-NEXT: ds_xor_rtn_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=a"()
+ %result = atomicrmw xor ptr addrspace(3) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+; Input is v, result used as AV
+define void @ds_atomic_xor_i32_ret_v_av(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i32_ret_v_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_xor_rtn_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=v"()
+ %result = atomicrmw xor ptr addrspace(3) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+define void @ds_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i32_ret_av_av_no_agprs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v40 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a3, v41 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a4, v42 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a5, v43 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a6, v44 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a7, v45 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a8, v46 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a9, v47 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a10, v56 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a11, v57 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a12, v58 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a13, v59 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a14, v60 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a15, v61 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a16, v62 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a17, v63 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_write_b32 a18, v31 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
+; CHECK-NEXT: ds_xor_rtn_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a31, v18 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a30, v19 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a29, v20 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a28, v21 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a27, v22 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a26, v23 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a25, v24 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a24, v25 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a23, v26 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a22, v27 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a21, v28 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a20, v29 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a19, v30 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; CHECK-NEXT: v_accvgpr_read_b32 v18, a31 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v19, a30 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v20, a29 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v21, a28 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v22, a27 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v23, a26 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v24, a25 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v25, a24 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v26, a23 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v27, a22 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v28, a21 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v29, a20 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v30, a19 ; Reload Reuse
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_accvgpr_read_b32 v31, a18 ; Reload Reuse
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v63, a17 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v62, a16 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v61, a15 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v60, a14 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v59, a13 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v58, a12 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v57, a11 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v56, a10 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v47, a9 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v46, a8 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v45, a7 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v44, a6 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v43, a5 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v42, a4 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v41, a3 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v40, a2 ; Reload Reuse
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"()
+ %vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0
+ %vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1
+ %result = atomicrmw xor ptr addrspace(3) %ptr, i32 %data seq_cst
+ call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1)
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+define void @ds_atomic_xor_i32_noret_a(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i32_noret_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_xor_b32 v0, a0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=a"()
+ %unused = atomicrmw xor ptr addrspace(3) %ptr, i32 %data seq_cst
+ ret void
+}
+
+define void @ds_atomic_xor_i32_noret_av(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i32_noret_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_xor_b32 v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %unused = atomicrmw xor ptr addrspace(3) %ptr, i32 %data seq_cst
+ ret void
+}
+
+;---------------------------------------------------------------------
+; xor i64 cases
+;---------------------------------------------------------------------
+
+; Input and result use AGPR
+define void @ds_atomic_xor_i64_ret_a_a(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i64_ret_a_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: ds_xor_rtn_b64 v[0:1], v0, v[2:3]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i64 asm "; def $0", "=a"()
+ %result = atomicrmw xor ptr addrspace(3) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+; Input is AGPR, result used as VGPR
+define void @ds_atomic_xor_i64_ret_a_v(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i64_ret_a_v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: ds_xor_rtn_b64 v[0:1], v0, v[2:3]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i64 asm "; def $0", "=a"()
+ %result = atomicrmw xor ptr addrspace(3) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "v"(i64 %result)
+ ret void
+}
+
+; Input is VGPR, result used as AGPR
+define void @ds_atomic_xor_i64_ret_v_a(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i64_ret_v_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_xor_rtn_b64 v[0:1], v0, v[2:3]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i64 asm "; def $0", "=v"()
+ %result = atomicrmw xor ptr addrspace(3) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+; Input is AV, result also used as AV
+define void @ds_atomic_xor_i64_ret_av_av(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i64_ret_av_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_xor_rtn_b64 v[0:1], v0, v[2:3]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i64 asm "; def $0", "=^VA"()
+ %result = atomicrmw xor ptr addrspace(3) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+; Input is AV, result used as v
+define void @ds_atomic_xor_i64_ret_av_v(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i64_ret_av_v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_xor_rtn_b64 v[0:1], v0, v[2:3]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i64 asm "; def $0", "=^VA"()
+ %result = atomicrmw xor ptr addrspace(3) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "v"(i64 %result)
+ ret void
+}
+
+; Input is AV, result used as a
+define void @ds_atomic_xor_i64_ret_av_a(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i64_ret_av_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_xor_rtn_b64 v[0:1], v0, v[2:3]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i64 asm "; def $0", "=^VA"()
+ %result = atomicrmw xor ptr addrspace(3) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+; Input is a, result used as AV
+define void @ds_atomic_xor_i64_ret_a_av(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i64_ret_a_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: ds_xor_rtn_b64 v[0:1], v0, v[2:3]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i64 asm "; def $0", "=a"()
+ %result = atomicrmw xor ptr addrspace(3) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+; Input is v, result used as AV
+define void @ds_atomic_xor_i64_ret_v_av(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i64_ret_v_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_xor_rtn_b64 v[0:1], v0, v[2:3]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %ptr, i32 0, i32 10
+ %data = call i64 asm "; def $0", "=v"()
+ %result = atomicrmw xor ptr addrspace(3) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+define void @ds_atomic_xor_i64_noret_a(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i64_noret_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_xor_b64 v0, a[0:1]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=a"()
+ %unused = atomicrmw xor ptr addrspace(3) %ptr, i64 %data seq_cst
+ ret void
+}
+
+define void @ds_atomic_xor_i64_noret_av(ptr addrspace(3) %ptr) #0 {
+; CHECK-LABEL: ds_atomic_xor_i64_noret_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_xor_b64 v0, v[2:3]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(3) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=^VA"()
+ %unused = atomicrmw xor ptr addrspace(3) %ptr, i64 %data seq_cst
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" }
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll
new file mode 100644
index 0000000..58f3ffb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomic-cmpxchg.ll
@@ -0,0 +1,1062 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s
+
+;---------------------------------------------------------------------
+; i32 cases
+;---------------------------------------------------------------------
+
+define void @flat_atomic_cmpxchg_i32_ret_av_av__av(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i32_ret_av_av__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v3
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i32 0, i32 10
+ %data0 = call i32 asm "; def $0", "=^VA"()
+ %data1 = call i32 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i32_ret_av_av__v(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i32_ret_av_av__v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v3
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i32 0, i32 10
+ %data0 = call i32 asm "; def $0", "=^VA"()
+ %data1 = call i32 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ call void asm "; use $0", "v"(i32 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i32_ret_av_av__a(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i32_ret_av_av__a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v3
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i32 0, i32 10
+ %data0 = call i32 asm "; def $0", "=^VA"()
+ %data1 = call i32 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ call void asm "; use $0", "a"(i32 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i32_ret_a_a__a(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i32_ret_a_a__a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a0
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i32 0, i32 10
+ %data0 = call i32 asm "; def $0", "=a"()
+ %data1 = call i32 asm "; def $0", "=a"()
+ %pair = cmpxchg ptr %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ call void asm "; use $0", "a"(i32 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i32_ret_a_a__v(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i32_ret_a_a__v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a0
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i32 0, i32 10
+ %data0 = call i32 asm "; def $0", "=a"()
+ %data1 = call i32 asm "; def $0", "=a"()
+ %pair = cmpxchg ptr %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ call void asm "; use $0", "v"(i32 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i32_ret_v_a__v(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i32_ret_v_a__v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v3
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i32 0, i32 10
+ %data0 = call i32 asm "; def $0", "=v"()
+ %data1 = call i32 asm "; def $0", "=a"()
+ %pair = cmpxchg ptr %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ call void asm "; use $0", "v"(i32 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i32_ret_a_v__v(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i32_ret_a_v__v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i32 0, i32 10
+ %data0 = call i32 asm "; def $0", "=a"()
+ %data1 = call i32 asm "; def $0", "=v"()
+ %pair = cmpxchg ptr %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ call void asm "; use $0", "v"(i32 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i32_ret_v_v__a(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i32_ret_v_v__a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v3
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i32 0, i32 10
+ %data0 = call i32 asm "; def $0", "=v"()
+ %data1 = call i32 asm "; def $0", "=v"()
+ %pair = cmpxchg ptr %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ call void asm "; use $0", "a"(i32 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i32_ret_av_v__av(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i32_ret_av_v__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v3
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i32 0, i32 10
+ %data0 = call i32 asm "; def $0", "=^VA"()
+ %data1 = call i32 asm "; def $0", "=v"()
+ %pair = cmpxchg ptr %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i32_ret_v_av__av(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i32_ret_v_av__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v3
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i32 0, i32 10
+ %data0 = call i32 asm "; def $0", "=v"()
+ %data1 = call i32 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i32_ret_av_a__av(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i32_ret_av_a__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v3
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i32 0, i32 10
+ %data0 = call i32 asm "; def $0", "=^VA"()
+ %data1 = call i32 asm "; def $0", "=a"()
+ %pair = cmpxchg ptr %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i32_ret_a_av__av(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i32_ret_a_av__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] offset:40 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i32 0, i32 10
+ %data0 = call i32 asm "; def $0", "=a"()
+ %data1 = call i32 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr %gep.0, i32 %data0, i32 %data1 seq_cst monotonic
+ %result = extractvalue { i32, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+;---------------------------------------------------------------------
+; i64 cases
+;---------------------------------------------------------------------
+
+define void @flat_atomic_cmpxchg_i64_ret_av_av__av(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_av_av__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB12_2
+; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT: .LBB12_2: ; %Flow
+; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB12_4
+; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
+; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: .LBB12_4: ; %atomicrmw.phi
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=^VA"()
+ %data1 = call i64 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i64_ret_av_av__v(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_av_av__v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB13_2
+; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT: .LBB13_2: ; %Flow
+; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB13_4
+; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
+; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: .LBB13_4: ; %atomicrmw.phi
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=^VA"()
+ %data1 = call i64 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "v"(i64 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i64_ret_av_av__a(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_av_av__a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; implicit-def: $agpr0_agpr1
+; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB14_2
+; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT: .LBB14_2: ; %Flow
+; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB14_4
+; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
+; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; CHECK-NEXT: .LBB14_4: ; %atomicrmw.phi
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=^VA"()
+ %data1 = call i64 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i64_ret_a_a__a(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_a_a__a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
+; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; CHECK-NEXT: ; implicit-def: $agpr0_agpr1
+; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB15_2
+; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT: .LBB15_2: ; %Flow
+; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB15_4
+; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
+; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; CHECK-NEXT: .LBB15_4: ; %atomicrmw.phi
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=a"()
+ %data1 = call i64 asm "; def $0", "=a"()
+ %pair = cmpxchg ptr %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i64_ret_a_a__v(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_a_a__v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
+; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB16_2
+; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT: .LBB16_2: ; %Flow
+; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB16_4
+; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
+; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: .LBB16_4: ; %atomicrmw.phi
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=a"()
+ %data1 = call i64 asm "; def $0", "=a"()
+ %pair = cmpxchg ptr %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "v"(i64 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i64_ret_v_a__v(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_v_a__v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
+; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB17_2
+; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT: .LBB17_2: ; %Flow
+; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB17_4
+; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
+; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: .LBB17_4: ; %atomicrmw.phi
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=v"()
+ %data1 = call i64 asm "; def $0", "=a"()
+ %pair = cmpxchg ptr %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "v"(i64 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i64_ret_a_v__v(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_a_v__v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB18_2
+; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT: .LBB18_2: ; %Flow
+; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB18_4
+; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
+; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: .LBB18_4: ; %atomicrmw.phi
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=a"()
+ %data1 = call i64 asm "; def $0", "=v"()
+ %pair = cmpxchg ptr %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "v"(i64 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i64_ret_v_v__a(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_v_v__a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; implicit-def: $agpr0_agpr1
+; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB19_2
+; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT: .LBB19_2: ; %Flow
+; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB19_4
+; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
+; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; CHECK-NEXT: .LBB19_4: ; %atomicrmw.phi
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=v"()
+ %data1 = call i64 asm "; def $0", "=v"()
+ %pair = cmpxchg ptr %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i64_ret_av_v__av(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_av_v__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB20_2
+; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT: .LBB20_2: ; %Flow
+; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB20_4
+; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
+; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: .LBB20_4: ; %atomicrmw.phi
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=^VA"()
+ %data1 = call i64 asm "; def $0", "=v"()
+ %pair = cmpxchg ptr %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i64_ret_v_av__av(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_v_av__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB21_2
+; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT: .LBB21_2: ; %Flow
+; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB21_4
+; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
+; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: .LBB21_4: ; %atomicrmw.phi
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=v"()
+ %data1 = call i64 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i64_ret_av_a__av(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_av_a__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
+; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB22_2
+; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT: .LBB22_2: ; %Flow
+; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB22_4
+; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
+; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: .LBB22_4: ; %atomicrmw.phi
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=^VA"()
+ %data1 = call i64 asm "; def $0", "=a"()
+ %pair = cmpxchg ptr %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+define void @flat_atomic_cmpxchg_i64_ret_a_av__av(ptr %ptr) #0 {
+; CHECK-LABEL: flat_atomic_cmpxchg_i64_ret_a_av__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0
+; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB23_2
+; CHECK-NEXT: ; %bb.1: ; %atomicrmw.global
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[6:7], v[0:3] glc
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT: .LBB23_2: ; %Flow
+; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; CHECK-NEXT: s_cbranch_execz .LBB23_4
+; CHECK-NEXT: ; %bb.3: ; %atomicrmw.private
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; CHECK-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
+; CHECK-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; CHECK-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4
+; CHECK-NEXT: .LBB23_4: ; %atomicrmw.phi
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=a"()
+ %data1 = call i64 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" }
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll
new file mode 100644
index 0000000..6680947
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomic-cmpxchg.ll
@@ -0,0 +1,728 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s
+
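+; These tests feed the compare value, the new value, and the result of a
+; global cmpxchg through inline asm constraints: AGPR ("a"), VGPR ("v"), or
+; either ("^VA"). The function suffix encodes that as _ret_<cmp>_<new>__<use>.
+; The checks verify that AGPR inputs are copied into VGPRs with
+; v_accvgpr_read_b32 before global_atomic_cmpswap_x2, and that a result used
+; as AGPR is copied back with v_accvgpr_write_b32.
+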
+;---------------------------------------------------------------------
+; i32-named cases (the payloads below are i64)
+;---------------------------------------------------------------------
+
+define void @global_atomic_cmpxchg_i32_ret_av_av__av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i32_ret_av_av__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=^VA"()
+ %data1 = call i64 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i32_ret_av_av__v(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i32_ret_av_av__v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=^VA"()
+ %data1 = call i64 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "v"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i32_ret_av_av__a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i32_ret_av_av__a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=^VA"()
+ %data1 = call i64 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i32_ret_a_a__a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i32_ret_a_a__a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=a"()
+ %data1 = call i64 asm "; def $0", "=a"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i32_ret_a_a__v(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i32_ret_a_a__v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=a"()
+ %data1 = call i64 asm "; def $0", "=a"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "v"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i32_ret_v_a__v(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i32_ret_v_a__v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=v"()
+ %data1 = call i64 asm "; def $0", "=a"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "v"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i32_ret_a_v__v(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i32_ret_a_v__v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=a"()
+ %data1 = call i64 asm "; def $0", "=v"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "v"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i32_ret_v_v__a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i32_ret_v_v__a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=v"()
+ %data1 = call i64 asm "; def $0", "=v"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i32_ret_av_v__av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i32_ret_av_v__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=^VA"()
+ %data1 = call i64 asm "; def $0", "=v"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i32_ret_v_av__av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i32_ret_v_av__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=v"()
+ %data1 = call i64 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i32_ret_av_a__av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i32_ret_av_a__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=^VA"()
+ %data1 = call i64 asm "; def $0", "=a"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i32_ret_a_av__av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i32_ret_a_av__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=a"()
+ %data1 = call i64 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+;---------------------------------------------------------------------
+; i64 cases
+;---------------------------------------------------------------------
+
+define void @global_atomic_cmpxchg_i64_ret_av_av__av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i64_ret_av_av__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=^VA"()
+ %data1 = call i64 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i64_ret_av_av__v(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i64_ret_av_av__v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=^VA"()
+ %data1 = call i64 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "v"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i64_ret_av_av__a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i64_ret_av_av__a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=^VA"()
+ %data1 = call i64 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i64_ret_a_a__a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i64_ret_a_a__a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=a"()
+ %data1 = call i64 asm "; def $0", "=a"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i64_ret_a_a__v(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i64_ret_a_a__v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=a"()
+ %data1 = call i64 asm "; def $0", "=a"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "v"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i64_ret_v_a__v(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i64_ret_v_a__v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=v"()
+ %data1 = call i64 asm "; def $0", "=a"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "v"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i64_ret_a_v__v(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i64_ret_a_v__v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=a"()
+ %data1 = call i64 asm "; def $0", "=v"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "v"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i64_ret_v_v__a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i64_ret_v_v__a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=v"()
+ %data1 = call i64 asm "; def $0", "=v"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i64_ret_av_v__av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i64_ret_av_v__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=^VA"()
+ %data1 = call i64 asm "; def $0", "=v"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i64_ret_v_av__av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i64_ret_v_av__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=v"()
+ %data1 = call i64 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i64_ret_av_a__av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i64_ret_av_a__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=^VA"()
+ %data1 = call i64 asm "; def $0", "=a"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_cmpxchg_i64_ret_a_av__av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_cmpxchg_i64_ret_a_av__av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[2:5], off offset:80 glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data0 = call i64 asm "; def $0", "=a"()
+ %data1 = call i64 asm "; def $0", "=^VA"()
+ %pair = cmpxchg ptr addrspace(1) %gep.0, i64 %data0, i64 %data1 seq_cst monotonic
+ %result = extractvalue { i64, i1 } %pair, 0
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" }
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
new file mode 100644
index 0000000..7c36642
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
@@ -0,0 +1,1539 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s
+
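+; These tests feed the atomicrmw data operand and result through inline asm
+; constraints: AGPR ("a"), VGPR ("v"), or either ("^VA"). The function suffix
+; encodes that as _ret_<data>_<use>, or _noret_<data> when the result is
+; unused. The checks show the no-return xchg forms taking an AGPR data
+; operand directly, the returning forms copying through VGPRs, and the xor
+; cases later in the file being lowered to a compare-and-swap loop instead
+; of a single atomic instruction.
+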
+;---------------------------------------------------------------------
+; xchg i32 cases
+;---------------------------------------------------------------------
+
+; Input and result use AGPR
+define void @global_atomic_xchg_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i32_ret_a_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_swap v0, v[0:1], v2, off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=a"()
+ %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "a"(i32 %result)
+ ret void
+}
+
+; Input is AGPR, result used as VGPR.
+define void @global_atomic_xchg_i32_ret_a_v(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i32_ret_a_v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_swap v0, v[0:1], v2, off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=a"()
+ %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "v"(i32 %result)
+ ret void
+}
+
+; Input is VGPR, result used as AGPR
+define void @global_atomic_xchg_i32_ret_v_a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i32_ret_v_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_swap v0, v[0:1], v2, off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=v"()
+ %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "a"(i32 %result)
+ ret void
+}
+
+; Input is AV, result also used as AV
+define void @global_atomic_xchg_i32_ret_av_av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i32_ret_av_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_swap v0, v[0:1], v2, off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+; Input is AV, result used as VGPR
+define void @global_atomic_xchg_i32_ret_av_v(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i32_ret_av_v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_swap v0, v[0:1], v2, off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "v"(i32 %result)
+ ret void
+}
+
+; Input is AV, result used as AGPR
+define void @global_atomic_xchg_i32_ret_av_a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i32_ret_av_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_swap v0, v[0:1], v2, off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "a"(i32 %result)
+ ret void
+}
+
+; Input is AGPR, result used as AV
+define void @global_atomic_xchg_i32_ret_a_av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i32_ret_a_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_swap v0, v[0:1], v2, off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=a"()
+ %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+; Input is VGPR, result used as AV
+define void @global_atomic_xchg_i32_ret_v_av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i32_ret_v_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_swap v0, v[0:1], v2, off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=v"()
+ %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
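+; Same as the av_av case, except the inline asm defs keep v[0:31] and
+; v[32:63] live across the atomic, so the AV data/result and the surrounding
+; live VGPRs are shuffled through AGPRs and scratch instead (the
+; "Reload Reuse" copies and the Folded Spill/Reload buffer accesses in the
+; checks).
+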
+define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i32_ret_av_av_no_agprs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a2
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_atomic_swap v0, v[0:1], v2, off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; CHECK-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"()
+ %vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0
+ %vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1
+ %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst
+ call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1)
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+define void @global_atomic_xchg_i32_noret_a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i32_noret_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_swap v[0:1], a0, off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=a"()
+ %unused = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst
+ ret void
+}
+
+define void @global_atomic_xchg_i32_noret_av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i32_noret_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_swap v[0:1], v2, off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %unused = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst
+ ret void
+}
+
+;---------------------------------------------------------------------
+; xchg i64 cases
+;---------------------------------------------------------------------
+
+; Input and result use AGPR
+define void @global_atomic_xchg_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i64_ret_a_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=a"()
+ %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+; Input is AGPR, result used as VGPR.
+define void @global_atomic_xchg_i64_ret_a_v(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i64_ret_a_v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=a"()
+ %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "v"(i64 %result)
+ ret void
+}
+
+; Input is VGPR, result used as AGPR
+define void @global_atomic_xchg_i64_ret_v_a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i64_ret_v_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=v"()
+ %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+; Input is AV, result also used as AV
+define void @global_atomic_xchg_i64_ret_av_av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i64_ret_av_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=^VA"()
+ %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+; Input is AV, result used as VGPR
+define void @global_atomic_xchg_i64_ret_av_v(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i64_ret_av_v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=^VA"()
+ %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "v"(i64 %result)
+ ret void
+}
+
+; Input is AV, result used as AGPR
+define void @global_atomic_xchg_i64_ret_av_a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i64_ret_av_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=^VA"()
+ %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+; Input is AGPR, result used as AV
+define void @global_atomic_xchg_i64_ret_a_av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i64_ret_a_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=a"()
+ %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+; Input is VGPR, result used as AV
+define void @global_atomic_xchg_i64_ret_v_av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i64_ret_v_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=v"()
+ %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_xchg_i64_noret_a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i64_noret_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_swap_x2 v[0:1], a[0:1], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=a"()
+ %unused = atomicrmw xchg ptr addrspace(1) %ptr, i64 %data seq_cst
+ ret void
+}
+
+define void @global_atomic_xchg_i64_noret_av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xchg_i64_noret_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=^VA"()
+ %unused = atomicrmw xchg ptr addrspace(1) %ptr, i64 %data seq_cst
+ ret void
+}
+
+;---------------------------------------------------------------------
+; xor i32 cases with cmpxchg expansion
+;---------------------------------------------------------------------
+
+; Input and result use AGPR
+define void @global_atomic_xor_expansion_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i32_ret_a_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v3, v[0:1], off
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: .LBB21_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_xor_b32_e32 v2, v3, v4
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v2
+; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB21_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=a"()
+ %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "a"(i32 %result)
+ ret void
+}
+
+; Input is AGPR, result used as VGPR.
+define void @global_atomic_xor_expansion_i32_ret_a_v(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i32_ret_a_v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v2, v[0:1], off
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: .LBB22_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v5, v2
+; CHECK-NEXT: v_xor_b32_e32 v4, v5, v3
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB22_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=a"()
+ %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "v"(i32 %result)
+ ret void
+}
+
+; Input is VGPR, result used as AGPR
+define void @global_atomic_xor_expansion_i32_ret_v_a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i32_ret_v_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v3, v[0:1], off
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v4
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: .LBB23_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_xor_b32_e32 v2, v3, v4
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v2
+; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB23_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=v"()
+ %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "a"(i32 %result)
+ ret void
+}
+
+; Input is AV, result also used as AV
+define void @global_atomic_xor_expansion_i32_ret_av_av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i32_ret_av_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v2, v[0:1], off
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v3
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: .LBB24_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v5, v2
+; CHECK-NEXT: v_xor_b32_e32 v4, v5, v3
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB24_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+; Input is AV, used as v
+define void @global_atomic_xor_expansion_i32_ret_av_v(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i32_ret_av_v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v2, v[0:1], off
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v3
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: .LBB25_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v5, v2
+; CHECK-NEXT: v_xor_b32_e32 v4, v5, v3
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB25_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "v"(i32 %result)
+ ret void
+}
+
+; Input is AV, used as a
+define void @global_atomic_xor_expansion_i32_ret_av_a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i32_ret_av_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v3, v[0:1], off
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v4
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: .LBB26_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_xor_b32_e32 v2, v3, v4
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v2
+; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB26_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "a"(i32 %result)
+ ret void
+}
+
+; Input is a, result used as AV
+define void @global_atomic_xor_expansion_i32_ret_a_av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i32_ret_a_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v2, v[0:1], off
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: .LBB27_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v5, v2
+; CHECK-NEXT: v_xor_b32_e32 v4, v5, v3
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB27_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=a"()
+ %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+; Input is v, result used as AV
+define void @global_atomic_xor_expansion_i32_ret_v_av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i32_ret_v_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v2, v[0:1], off
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v3
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: .LBB28_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v5, v2
+; CHECK-NEXT: v_xor_b32_e32 v4, v5, v3
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB28_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=v"()
+ %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i32_ret_av_av_no_agprs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT: v_accvgpr_write_b32 a33, v1
+; CHECK-NEXT: v_accvgpr_write_b32 a32, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v2
+; CHECK-NEXT: v_accvgpr_write_b32 a3, v3
+; CHECK-NEXT: v_accvgpr_write_b32 a4, v4
+; CHECK-NEXT: v_accvgpr_write_b32 a5, v5
+; CHECK-NEXT: v_accvgpr_write_b32 a6, v6
+; CHECK-NEXT: v_accvgpr_write_b32 a7, v7
+; CHECK-NEXT: v_accvgpr_write_b32 a8, v8
+; CHECK-NEXT: v_accvgpr_write_b32 a9, v9
+; CHECK-NEXT: v_accvgpr_write_b32 a10, v10
+; CHECK-NEXT: v_accvgpr_write_b32 a11, v11
+; CHECK-NEXT: v_accvgpr_write_b32 a12, v12
+; CHECK-NEXT: v_accvgpr_write_b32 a13, v13
+; CHECK-NEXT: v_accvgpr_write_b32 a14, v14
+; CHECK-NEXT: v_accvgpr_write_b32 a15, v15
+; CHECK-NEXT: v_accvgpr_write_b32 a16, v16
+; CHECK-NEXT: v_accvgpr_write_b32 a17, v17
+; CHECK-NEXT: v_accvgpr_write_b32 a18, v18
+; CHECK-NEXT: v_accvgpr_write_b32 a19, v19
+; CHECK-NEXT: v_accvgpr_write_b32 a20, v20
+; CHECK-NEXT: v_accvgpr_write_b32 a21, v21
+; CHECK-NEXT: v_accvgpr_write_b32 a22, v22
+; CHECK-NEXT: v_accvgpr_write_b32 a23, v23
+; CHECK-NEXT: v_accvgpr_write_b32 a24, v24
+; CHECK-NEXT: v_accvgpr_write_b32 a25, v25
+; CHECK-NEXT: v_accvgpr_write_b32 a26, v26
+; CHECK-NEXT: v_accvgpr_write_b32 a27, v27
+; CHECK-NEXT: v_accvgpr_write_b32 a28, v28
+; CHECK-NEXT: v_accvgpr_write_b32 a29, v29
+; CHECK-NEXT: v_accvgpr_write_b32 a30, v30
+; CHECK-NEXT: v_accvgpr_write_b32 a31, v31
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a32
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a33
+; CHECK-NEXT: global_load_dword v1, v[4:5], off
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: .LBB29_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v3, v1
+; CHECK-NEXT: v_xor_b32_e32 v2, v3, v0
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap v1, v[4:5], v[2:3], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB29_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: v_accvgpr_write_b32 a32, v1
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a2
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a3
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a4
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a5
+; CHECK-NEXT: v_accvgpr_read_b32 v6, a6
+; CHECK-NEXT: v_accvgpr_read_b32 v7, a7
+; CHECK-NEXT: v_accvgpr_read_b32 v8, a8
+; CHECK-NEXT: v_accvgpr_read_b32 v9, a9
+; CHECK-NEXT: v_accvgpr_read_b32 v10, a10
+; CHECK-NEXT: v_accvgpr_read_b32 v11, a11
+; CHECK-NEXT: v_accvgpr_read_b32 v12, a12
+; CHECK-NEXT: v_accvgpr_read_b32 v13, a13
+; CHECK-NEXT: v_accvgpr_read_b32 v14, a14
+; CHECK-NEXT: v_accvgpr_read_b32 v15, a15
+; CHECK-NEXT: v_accvgpr_read_b32 v16, a16
+; CHECK-NEXT: v_accvgpr_read_b32 v17, a17
+; CHECK-NEXT: v_accvgpr_read_b32 v18, a18
+; CHECK-NEXT: v_accvgpr_read_b32 v19, a19
+; CHECK-NEXT: v_accvgpr_read_b32 v20, a20
+; CHECK-NEXT: v_accvgpr_read_b32 v21, a21
+; CHECK-NEXT: v_accvgpr_read_b32 v22, a22
+; CHECK-NEXT: v_accvgpr_read_b32 v23, a23
+; CHECK-NEXT: v_accvgpr_read_b32 v24, a24
+; CHECK-NEXT: v_accvgpr_read_b32 v25, a25
+; CHECK-NEXT: v_accvgpr_read_b32 v26, a26
+; CHECK-NEXT: v_accvgpr_read_b32 v27, a27
+; CHECK-NEXT: v_accvgpr_read_b32 v28, a28
+; CHECK-NEXT: v_accvgpr_read_b32 v29, a29
+; CHECK-NEXT: v_accvgpr_read_b32 v30, a30
+; CHECK-NEXT: v_accvgpr_read_b32 v31, a31
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a32
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"()
+ %vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0
+ %vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1
+ %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst
+ call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1)
+ call void asm "; use $0", "^VA"(i32 %result)
+ ret void
+}
+
+define void @global_atomic_xor_expansion_i32_noret_a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i32_noret_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v3, v[0:1], off
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: .LBB30_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_xor_b32_e32 v2, v3, v4
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB30_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=a"()
+ %unused = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst
+ ret void
+}
+
+define void @global_atomic_xor_expansion_i32_noret_av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i32_noret_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dword v3, v[0:1], off
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v4
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: .LBB31_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_xor_b32_e32 v2, v3, v4
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB31_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i32 asm "; def $0", "=^VA"()
+ %unused = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst
+ ret void
+}
+
+;---------------------------------------------------------------------
+; xor i64 cases with cmpxchg expansion
+;---------------------------------------------------------------------
+
+; Input and result use AGPR
+define void @global_atomic_xor_expansion_i64_ret_a_a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i64_ret_a_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v7, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v6, a0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: .LBB32_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_xor_b32_e32 v3, v5, v7
+; CHECK-NEXT: v_xor_b32_e32 v2, v4, v6
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v2
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v3
+; CHECK-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB32_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=a"()
+ %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+; Input is AGPR, result used as VGPR.
+define void @global_atomic_xor_expansion_i64_ret_a_v(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i64_ret_a_v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: .LBB33_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; CHECK-NEXT: v_xor_b32_e32 v5, v7, v3
+; CHECK-NEXT: v_xor_b32_e32 v4, v6, v2
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB33_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=a"()
+ %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "v"(i64 %result)
+ ret void
+}
+
+; Input is VGPR, result used as AGPR
+define void @global_atomic_xor_expansion_i64_ret_v_a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i64_ret_v_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[6:7]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: .LBB34_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_xor_b32_e32 v3, v5, v7
+; CHECK-NEXT: v_xor_b32_e32 v2, v4, v6
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v2
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v3
+; CHECK-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB34_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=v"()
+ %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+; Input is AV, result also used as AV
+define void @global_atomic_xor_expansion_i64_ret_av_av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i64_ret_av_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: .LBB35_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; CHECK-NEXT: v_xor_b32_e32 v5, v7, v3
+; CHECK-NEXT: v_xor_b32_e32 v4, v6, v2
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB35_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=^VA"()
+ %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+; Input is AV, used as v
+define void @global_atomic_xor_expansion_i64_ret_av_v(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i64_ret_av_v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: .LBB36_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; CHECK-NEXT: v_xor_b32_e32 v5, v7, v3
+; CHECK-NEXT: v_xor_b32_e32 v4, v6, v2
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB36_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=^VA"()
+ %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "v"(i64 %result)
+ ret void
+}
+
+; Input is AV, used as a
+define void @global_atomic_xor_expansion_i64_ret_av_a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i64_ret_av_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[6:7]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: .LBB37_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_xor_b32_e32 v3, v5, v7
+; CHECK-NEXT: v_xor_b32_e32 v2, v4, v6
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v2
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v3
+; CHECK-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB37_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=^VA"()
+ %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "a"(i64 %result)
+ ret void
+}
+
+; Input is a, result used as AV
+define void @global_atomic_xor_expansion_i64_ret_a_av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i64_ret_a_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: .LBB38_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; CHECK-NEXT: v_xor_b32_e32 v5, v7, v3
+; CHECK-NEXT: v_xor_b32_e32 v4, v6, v2
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB38_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=a"()
+ %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+; Input is v, result used as AV
+define void @global_atomic_xor_expansion_i64_ret_v_av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i64_ret_v_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[2:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: .LBB39_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; CHECK-NEXT: v_xor_b32_e32 v5, v7, v3
+; CHECK-NEXT: v_xor_b32_e32 v4, v6, v2
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB39_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[4:5]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=v"()
+ %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data seq_cst
+ call void asm "; use $0", "^VA"(i64 %result)
+ ret void
+}
+
+define void @global_atomic_xor_expansion_i64_noret_a(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i64_noret_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v7, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v6, a0
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: .LBB40_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_xor_b32_e32 v3, v5, v7
+; CHECK-NEXT: v_xor_b32_e32 v2, v4, v6
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB40_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=a"()
+ %unused = atomicrmw xor ptr addrspace(1) %ptr, i64 %data seq_cst
+ ret void
+}
+
+define void @global_atomic_xor_expansion_i64_noret_av(ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: global_atomic_xor_expansion_i64_noret_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[6:7]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: .LBB41_1: ; %atomicrmw.start
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_xor_b32_e32 v3, v5, v7
+; CHECK-NEXT: v_xor_b32_e32 v2, v4, v6
+; CHECK-NEXT: buffer_wbl2
+; CHECK-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_invl2
+; CHECK-NEXT: buffer_wbinvl1_vol
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_cbranch_execnz .LBB41_1
+; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10
+ %data = call i64 asm "; def $0", "=^VA"()
+ %unused = atomicrmw xor ptr addrspace(1) %ptr, i64 %data seq_cst
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" }
diff --git a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
index be4e369..002ccd6 100644
--- a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
+++ b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
@@ -203,7 +203,7 @@ define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic_store(ptr addrspace(1)
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tid
- %in.1 = atomicrmw volatile sub ptr addrspace(1) %gep, i32 1 syncscope("agent") seq_cst
+ %in.1 = atomicrmw volatile sub ptr addrspace(1) %gep, i32 1 syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
%tmp0 = insertelement <4 x i32> poison, i32 %in.1, i32 0
%tmp1 = insertelement <4 x i32> %tmp0, i32 0, i32 1
%tmp2 = insertelement <4 x i32> %tmp1, i32 0, i32 2
@@ -229,7 +229,7 @@ define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic64_store(ptr addrspace(1
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tid
- %in.1 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 1 syncscope("agent") seq_cst
+ %in.1 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 1 syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
%tmp0 = insertelement <2 x i64> poison, i64 %in.1, i32 0
%tmp1 = insertelement <2 x i64> %tmp0, i64 0, i32 1
%tmp2 = bitcast <2 x i64> %tmp1 to <4 x i32>
@@ -319,3 +319,5 @@ exit:
}
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/add_i64.ll b/llvm/test/CodeGen/AMDGPU/add_i64.ll
index eedd56d..a8560e8 100644
--- a/llvm/test/CodeGen/AMDGPU/add_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_i64.ll
@@ -1,12 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI %s
-
-declare i32 @llvm.amdgcn.workitem.id.x() readnone
-
-; SI-LABEL: {{^}}test_i64_vreg:
-; SI: v_add_i32
-; SI: v_addc_u32
define amdgpu_kernel void @test_i64_vreg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
+; SI-LABEL: test_i64_vreg:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s14, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b64 s[6:7], s[14:15]
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[12:13], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[12:15], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%a_ptr = getelementptr i64, ptr addrspace(1) %inA, i32 %tid
%b_ptr = getelementptr i64, ptr addrspace(1) %inB, i32 %tid
@@ -18,10 +35,22 @@ define amdgpu_kernel void @test_i64_vreg(ptr addrspace(1) noalias %out, ptr addr
}
; Check that the SGPR add operand is correctly moved to a VGPR.
-; SI-LABEL: {{^}}sgpr_operand:
-; SI: s_add_u32
-; SI: s_addc_u32
define amdgpu_kernel void @sgpr_operand(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in_bar, i64 %a) {
+; SI-LABEL: sgpr_operand:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_u32 s4, s6, s4
+; SI-NEXT: s_addc_u32 s5, s7, s5
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
%foo = load i64, ptr addrspace(1) %in, align 8
%result = add i64 %foo, %a
store i64 %result, ptr addrspace(1) %out
@@ -30,35 +59,76 @@ define amdgpu_kernel void @sgpr_operand(ptr addrspace(1) noalias %out, ptr addrs
; Swap the arguments. Check that the SGPR -> VGPR copy works with the
; SGPR as other operand.
-;
-; SI-LABEL: {{^}}sgpr_operand_reversed:
-; SI: s_add_u32
-; SI: s_addc_u32
define amdgpu_kernel void @sgpr_operand_reversed(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i64 %a) {
+; SI-LABEL: sgpr_operand_reversed:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_u32 s4, s4, s6
+; SI-NEXT: s_addc_u32 s5, s5, s7
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
%foo = load i64, ptr addrspace(1) %in, align 8
%result = add i64 %a, %foo
store i64 %result, ptr addrspace(1) %out
ret void
}
-
-; SI-LABEL: {{^}}test_v2i64_sreg:
-; SI: s_add_u32
-; SI: s_addc_u32
-; SI: s_add_u32
-; SI: s_addc_u32
define amdgpu_kernel void @test_v2i64_sreg(ptr addrspace(1) noalias %out, <2 x i64> %a, <2 x i64> %b) {
+; SI-LABEL: test_v2i64_sreg:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_u32 s4, s10, s14
+; SI-NEXT: s_addc_u32 s5, s11, s15
+; SI-NEXT: s_add_u32 s6, s8, s12
+; SI-NEXT: s_addc_u32 s7, s9, s13
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v2, s4
+; SI-NEXT: v_mov_b32_e32 v3, s5
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
%result = add <2 x i64> %a, %b
store <2 x i64> %result, ptr addrspace(1) %out
ret void
}
-; SI-LABEL: {{^}}test_v2i64_vreg:
-; SI: v_add_i32
-; SI: v_addc_u32
-; SI: v_add_i32
-; SI: v_addc_u32
define amdgpu_kernel void @test_v2i64_vreg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
+; SI-LABEL: test_v2i64_vreg:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s14, 0
+; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; SI-NEXT: v_mov_b32_e32 v5, 0
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b64 s[6:7], s[14:15]
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[12:13], s[2:3]
+; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[12:15], 0 addr64
+; SI-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc
+; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%a_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inA, i32 %tid
%b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inB, i32 %tid
@@ -69,14 +139,19 @@ define amdgpu_kernel void @test_v2i64_vreg(ptr addrspace(1) noalias %out, ptr ad
ret void
}
-; SI-LABEL: {{^}}trunc_i64_add_to_i32:
-; SI: s_load_dword s[[SREG0:[0-9]+]]
-; SI: s_load_dword s[[SREG1:[0-9]+]]
-; SI: s_add_i32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]]
-; SI-NOT: addc
-; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: buffer_store_dword [[VRESULT]],
define amdgpu_kernel void @trunc_i64_add_to_i32(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b) {
+; SI-LABEL: trunc_i64_add_to_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[4:5], 0xd
+; SI-NEXT: s_load_dword s6, s[4:5], 0x11
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s4, s6, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
%add = add i64 %b, %a
%trunc = trunc i64 %add to i32
store i32 %trunc, ptr addrspace(1) %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
index 28f5551..42c7b90 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -169,6 +169,6 @@ attributes #1 = { nounwind }
;.
; HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
new file mode 100644
index 0000000..4b6375c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
@@ -0,0 +1,134 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+; Test code sequences for addrspacecast with globally addressable scratch.
+
+target triple = "amdgcn-amd-amdhsa"
+
+define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX1250-SDAG-LABEL: use_private_to_flat_addrspacecast:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
+; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_lshlrev_b32 v1, 20, v0
+; GFX1250-SDAG-NEXT: s_cmp_lg_u32 s2, -1
+; GFX1250-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0, v1
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
+; GFX1250-SDAG-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, -1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0
+; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1250-GISEL-NEXT: s_cselect_b32 s0, 1, 0
+; GFX1250-GISEL-NEXT: s_and_b32 s0, 1, s0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo
+; GFX1250-GISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0, v1
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
+; GFX1250-GISEL-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX1250-GISEL-NEXT: s_endpgm
+ %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+ store volatile i32 0, ptr %stof
+ ret void
+}
+
+define amdgpu_kernel void @use_private_to_flat_addrspacecast_nonnull(ptr addrspace(5) %ptr) {
+; GFX1250-SDAG-LABEL: use_private_to_flat_addrspacecast_nonnull:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 20, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
+; GFX1250-SDAG-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast_nonnull:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v2, 20, v2
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo
+; GFX1250-GISEL-NEXT: flat_store_b32 v[0:1], v3 scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX1250-GISEL-NEXT: s_endpgm
+ %stof = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %ptr)
+ store volatile i32 0, ptr %stof
+ ret void
+}
+
+define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) {
+; GFX1250-LABEL: use_flat_to_private_addrspacecast:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_sub_co_i32 s2, s0, s2
+; GFX1250-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-NEXT: s_cselect_b32 s0, s2, -1
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
+ %ftos = addrspacecast ptr %ptr to ptr addrspace(5)
+ store volatile i32 0, ptr addrspace(5) %ftos
+ ret void
+}
+
+define amdgpu_kernel void @use_flat_to_private_addrspacecast_nonnull(ptr %ptr) {
+; GFX1250-SDAG-LABEL: use_flat_to_private_addrspacecast_nonnull:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_sub_co_i32 s0, s0, s1
+; GFX1250-SDAG-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: use_flat_to_private_addrspacecast_nonnull:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s0, s1
+; GFX1250-GISEL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX1250-GISEL-NEXT: s_endpgm
+ %ftos = call ptr addrspace(5) @llvm.amdgcn.addrspacecast.nonnull.p5.p0(ptr %ptr)
+ store volatile i32 0, ptr addrspace(5) %ftos
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/allow-check.ll b/llvm/test/CodeGen/AMDGPU/allow-check.ll
index d4f5621..162a8bcf 100644
--- a/llvm/test/CodeGen/AMDGPU/allow-check.ll
+++ b/llvm/test/CodeGen/AMDGPU/allow-check.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -global-isel=0 -fast-isel=0 | FileCheck %s
-; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -global-isel=1 -fast-isel=0 | FileCheck %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -global-isel=1 -new-reg-bank-select -fast-isel=0 | FileCheck %s
; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -global-isel=0 -fast-isel=1 | FileCheck %s
define i1 @test_runtime() local_unnamed_addr {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
new file mode 100644
index 0000000..eac0767
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=DAGISEL
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GISEL
+
+declare amdgpu_gfx_whole_wave i32 @good_callee(i1 %active, i32 %x, i32 %y, i32 inreg %c)
+
+define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr) {
+; DAGISEL-LABEL: basic_test:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_mov_b32 s0, s33
+; DAGISEL-NEXT: s_mov_b32 s33, s32
+; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
+; DAGISEL-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
+; DAGISEL-NEXT: v_writelane_b32 v42, s0, 2
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; DAGISEL-NEXT: scratch_store_b32 off, v41, s33
+; DAGISEL-NEXT: v_dual_mov_b32 v41, v2 :: v_dual_mov_b32 v40, v1
+; DAGISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0
+; DAGISEL-NEXT: v_writelane_b32 v42, s30, 0
+; DAGISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi
+; DAGISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo
+; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
+; DAGISEL-NEXT: v_writelane_b32 v42, s31, 1
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL-NEXT: global_store_b32 v[40:41], v0, off
+; DAGISEL-NEXT: s_clause 0x1
+; DAGISEL-NEXT: scratch_load_b32 v41, off, s33
+; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
+; DAGISEL-NEXT: v_readlane_b32 s31, v42, 1
+; DAGISEL-NEXT: v_readlane_b32 s30, v42, 0
+; DAGISEL-NEXT: s_mov_b32 s32, s33
+; DAGISEL-NEXT: v_readlane_b32 s0, v42, 2
+; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
+; DAGISEL-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
+; DAGISEL-NEXT: s_mov_b32 s33, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: basic_test:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_mov_b32 s0, s33
+; GISEL-NEXT: s_mov_b32 s33, s32
+; GISEL-NEXT: s_or_saveexec_b32 s1, -1
+; GISEL-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_mov_b32 exec_lo, s1
+; GISEL-NEXT: v_writelane_b32 v42, s0, 2
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_store_b32 off, v40, s33 offset:4
+; GISEL-NEXT: scratch_store_b32 off, v41, s33
+; GISEL-NEXT: v_dual_mov_b32 v40, v1 :: v_dual_mov_b32 v41, v2
+; GISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0
+; GISEL-NEXT: v_writelane_b32 v42, s30, 0
+; GISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo
+; GISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi
+; GISEL-NEXT: s_add_co_i32 s32, s32, 16
+; GISEL-NEXT: v_writelane_b32 v42, s31, 1
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL-NEXT: global_store_b32 v[40:41], v0, off
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: scratch_load_b32 v41, off, s33
+; GISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
+; GISEL-NEXT: v_readlane_b32 s31, v42, 1
+; GISEL-NEXT: v_readlane_b32 s30, v42, 0
+; GISEL-NEXT: s_mov_b32 s32, s33
+; GISEL-NEXT: v_readlane_b32 s0, v42, 2
+; GISEL-NEXT: s_or_saveexec_b32 s1, -1
+; GISEL-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_mov_b32 exec_lo, s1
+; GISEL-NEXT: s_mov_b32 s33, s0
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %y = add i32 %x, 13
+ %ret = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @good_callee, i32 %x, i32 %y, i32 inreg %c)
+ store i32 %ret, ptr addrspace(1) %ptr
+ ret void
+}
+
+declare amdgpu_gfx_whole_wave void @void_callee(i1 %active, i32 %x)
+
+define amdgpu_gfx void @ret_void(i32 %x) {
+; DAGISEL-LABEL: ret_void:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_mov_b32 s0, s33
+; DAGISEL-NEXT: s_mov_b32 s33, s32
+; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
+; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
+; DAGISEL-NEXT: v_writelane_b32 v40, s0, 2
+; DAGISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi
+; DAGISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo
+; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
+; DAGISEL-NEXT: v_writelane_b32 v40, s30, 0
+; DAGISEL-NEXT: v_writelane_b32 v40, s31, 1
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL-NEXT: v_readlane_b32 s31, v40, 1
+; DAGISEL-NEXT: v_readlane_b32 s30, v40, 0
+; DAGISEL-NEXT: s_mov_b32 s32, s33
+; DAGISEL-NEXT: v_readlane_b32 s0, v40, 2
+; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
+; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
+; DAGISEL-NEXT: s_mov_b32 s33, s0
+; DAGISEL-NEXT: s_wait_loadcnt 0x0
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: ret_void:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_mov_b32 s0, s33
+; GISEL-NEXT: s_mov_b32 s33, s32
+; GISEL-NEXT: s_or_saveexec_b32 s1, -1
+; GISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_mov_b32 exec_lo, s1
+; GISEL-NEXT: v_writelane_b32 v40, s0, 2
+; GISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo
+; GISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi
+; GISEL-NEXT: s_add_co_i32 s32, s32, 16
+; GISEL-NEXT: v_writelane_b32 v40, s30, 0
+; GISEL-NEXT: v_writelane_b32 v40, s31, 1
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_readlane_b32 s31, v40, 1
+; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: s_mov_b32 s32, s33
+; GISEL-NEXT: v_readlane_b32 s0, v40, 2
+; GISEL-NEXT: s_or_saveexec_b32 s1, -1
+; GISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_mov_b32 exec_lo, s1
+; GISEL-NEXT: s_mov_b32 s33, s0
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ call void(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @void_callee, i32 %x)
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-cs-chain-intrinsic-dyn-vgpr-w32.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-cs-chain-intrinsic-dyn-vgpr-w32.ll
index 77c9b98..3e80a58 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-cs-chain-intrinsic-dyn-vgpr-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-cs-chain-intrinsic-dyn-vgpr-w32.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GISEL-GFX12 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GISEL-GFX12 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=DAGISEL-GFX12 %s
declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, ptr addrspace(5), i32, i32 })
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 0d5f538..d03d6a8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -6309,64 +6309,64 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
@@ -6394,50 +6394,50 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
@@ -6498,50 +6498,50 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1
; GFX11-TRUE16-NEXT: .LBB12_4: ; %end
@@ -6549,307 +6549,266 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v55, v39
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v135.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v53, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v131.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v53, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l
; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v53
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v53
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v39
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v118.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v51, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v52
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v51, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v16.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v52
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v50, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v114.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v50, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l
; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v49, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v50
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v49, v39
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v101.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v48, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v49
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v48, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v22.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v49
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v97.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v39
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v25.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v84.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v27.l, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v28.l, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v80.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v71.l
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v69.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v30.l, v30.h
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v39
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v31.l, v31.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v68.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v32.l, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v39
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32
; GFX11-TRUE16-NEXT: s_clause 0x5
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
@@ -15413,63 +15372,63 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136
@@ -15483,144 +15442,143 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v29.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v82.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v84.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v85.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v86.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v99.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v52.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v49.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -15634,746 +15592,660 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v151.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v150.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v146.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v151.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v150.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v147.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v0.h, v149.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v149.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v144.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v133.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v148.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v144.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v145.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v1.h, v147.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v131.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v2.l, v146.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v131.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v135.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v3.l, v145.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v128.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v118.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v116.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v4.l, v135.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v134.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v5.l, v133.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v132.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v114.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v6.l, v132.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v130.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v7.l, v130.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v129.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v102.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v8.l, v128.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v119.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v9.l, v118.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v102.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v117.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v97.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v10.l, v116.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v100.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v115.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v11.l, v114.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v11.h, v113.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v85.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v12.l, v113.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v103.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v13.l, v103.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v101.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v81.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v14.l, v101.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v99.h
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v71.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v15.l, v99.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v15.h, v98.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v16.l, v97.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v16.h, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v17.l, v87.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v17.h, v86.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v64.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v18.l, v85.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v18.h, v84.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v51.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v19.l, v83.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v19.h, v82.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v20.l, v82.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v20.h, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v21.l, v80.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v21.h, v70.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v22.l, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v22.h, v68.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v23.l, v68.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v23.h, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v24.l, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v24.h, v65.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v25.l, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v25.h, v55.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v26.l, v55.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v26.h, v54.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v27.l, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v28.h, v53.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v28.l, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v28.h, v52.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v29.h, v51.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v29.l, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v30.l, v50.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v49.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v151.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v146.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v147.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v149.l, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v134.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v148.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v145.h, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v131.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v135.h, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v135.l, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.l, v32.l, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v134.l, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v118.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v133.l, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v132.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v132.l, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v117.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v114.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.l, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v112.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v128.l, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v119.h, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v102.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v102.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v117.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v100.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v116.l, v10.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v100.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v10.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v115.h, v11.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v97.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v114.h, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v87.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v96.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v103.h, v13.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v85.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v103.l, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v101.h, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v83.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v84.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v14.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v99.h, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v99.l, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v98.l, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v71.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v97.l, v16.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v71.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v16.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v16.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v87.l, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v86.h, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v67.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v85.l, v18.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v18.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v18.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v84.h, v19.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.h, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v82.h, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v51.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v82.l, v20.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v52.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v20.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v20.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v80.h, v21.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v80.l, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v70.h, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v39.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.l, v22.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v22.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v22.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v68.h, v23.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v68.l, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v37.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v66.h, v24.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v64.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v26, v31
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v65.l, v25.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v25.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v27, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v55.l, v26.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v55.h, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v26.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v33.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v28.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.l, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v50.l, v30.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.l, 0x300, v30.h
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v30.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v49.h, v30.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v32.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -42156,64 +42028,64 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
@@ -42241,50 +42113,50 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
@@ -42328,50 +42200,50 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1
; GFX11-TRUE16-NEXT: .LBB36_4: ; %end
@@ -42379,307 +42251,266 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v55, v39
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v135.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v53, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v131.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v53, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l
; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v53
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v53
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v39
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v118.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v51, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v52
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v51, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v16.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v52
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v50, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v114.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v50, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l
; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v49, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v50
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v49, v39
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v101.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v48, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v49
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v48, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v22.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v49
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v97.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v39
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v25.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v84.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v27.l, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v28.l, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v80.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v71.l
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v69.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v30.l, v30.h
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v39
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v31.l, v31.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v68.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v32.l, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v39
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32
; GFX11-TRUE16-NEXT: s_clause 0x5
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
@@ -52210,63 +52041,63 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136
@@ -52280,144 +52111,143 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v29.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v82.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v84.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v85.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v86.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v99.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v52.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v49.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -52431,746 +52261,660 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB38_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v151.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v150.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v146.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v151.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v150.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v147.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v0.h, v149.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v149.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v144.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v133.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v148.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v144.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v145.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v1.h, v147.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v131.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v2.l, v146.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v131.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v135.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v3.l, v145.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v128.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v118.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v116.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v4.l, v135.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v134.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v5.l, v133.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v132.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v114.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v6.l, v132.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v130.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v7.l, v130.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v129.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v102.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v8.l, v128.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v119.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v9.l, v118.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v102.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v117.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v97.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v10.l, v116.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v100.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v115.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v11.l, v114.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v11.h, v113.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v85.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v12.l, v113.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v103.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v13.l, v103.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v101.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v81.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v14.l, v101.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v99.h
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v71.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v15.l, v99.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v15.h, v98.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v16.l, v97.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v16.h, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v17.l, v87.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v17.h, v86.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v64.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v18.l, v85.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v18.h, v84.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v51.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v19.l, v83.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v19.h, v82.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v20.l, v82.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v20.h, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v21.l, v80.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v21.h, v70.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v22.l, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v22.h, v68.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v23.l, v68.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v23.h, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v24.l, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v24.h, v65.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v25.l, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v25.h, v55.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v26.l, v55.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v26.h, v54.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v27.l, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v28.h, v53.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v28.l, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v28.h, v52.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v29.h, v51.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v29.l, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v30.l, v50.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v49.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_2
; GFX11-TRUE16-NEXT: .LBB38_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v151.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v146.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v147.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v149.l, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v134.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v148.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v145.h, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v131.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v135.h, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v135.l, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.l, v32.l, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v134.l, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v118.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v133.l, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v132.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v132.l, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v117.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v114.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.l, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v112.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v128.l, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v119.h, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v102.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v102.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v117.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v100.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v116.l, v10.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v100.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v10.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v115.h, v11.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v97.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v114.h, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v87.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v96.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v103.h, v13.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v85.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v103.l, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v101.h, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v83.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v84.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v14.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v99.h, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v99.l, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v98.l, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v71.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v97.l, v16.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v71.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v16.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v16.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v87.l, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v86.h, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v67.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v85.l, v18.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v18.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v18.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v84.h, v19.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.h, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v82.h, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v51.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v82.l, v20.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v52.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v20.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v20.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v80.h, v21.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v80.l, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v70.h, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v39.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.l, v22.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v22.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v22.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v68.h, v23.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v68.l, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v37.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v66.h, v24.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v64.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v26, v31
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v65.l, v25.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v25.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v27, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v55.l, v26.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v55.h, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v26.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v33.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v28.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.l, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v50.l, v30.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.l, 0x300, v30.h
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v30.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v49.h, v30.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v32.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -77938,64 +77682,64 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
@@ -78023,50 +77767,50 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
@@ -78135,50 +77879,50 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1
; GFX11-TRUE16-NEXT: .LBB56_4: ; %end
@@ -78186,307 +77930,266 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v55, v39
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v135.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v53, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v131.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v53, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l
; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v53
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v53
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v39
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v118.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v51, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v52
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v51, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v16.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v52
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v50, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v114.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v50, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l
; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v49, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v50
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v49, v39
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v101.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v48, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v49
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v48, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v22.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v49
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v97.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v39
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v25.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v84.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v27.l, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v28.l, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v80.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v71.l
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v69.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v30.l, v30.h
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v39
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v31.l, v31.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v68.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v32.l, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v39
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32
; GFX11-TRUE16-NEXT: s_clause 0x5
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
@@ -87060,63 +86763,63 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136
@@ -87130,144 +86833,143 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v29.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v82.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v84.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v85.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v86.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v99.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v52.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v49.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -87281,746 +86983,660 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v151.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v150.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v146.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v151.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v150.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v147.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v0.h, v149.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v149.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v144.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v133.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v148.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v144.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v145.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v1.h, v147.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v131.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v2.l, v146.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v131.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v135.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v3.l, v145.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v128.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v118.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v116.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v4.l, v135.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v134.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v5.l, v133.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v132.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v114.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v6.l, v132.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v130.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v7.l, v130.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v129.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v102.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v8.l, v128.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v119.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v9.l, v118.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v102.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v117.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v97.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v10.l, v116.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v100.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v115.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v11.l, v114.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v11.h, v113.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v85.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v12.l, v113.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v103.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v13.l, v103.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v101.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v81.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v14.l, v101.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v99.h
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v71.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v15.l, v99.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v15.h, v98.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v16.l, v97.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v16.h, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v17.l, v87.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v17.h, v86.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v64.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v18.l, v85.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v18.h, v84.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v51.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v19.l, v83.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v19.h, v82.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v20.l, v82.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v20.h, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v21.l, v80.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v21.h, v70.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v22.l, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v22.h, v68.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v23.l, v68.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v23.h, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v24.l, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v24.h, v65.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v25.l, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v25.h, v55.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v26.l, v55.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v26.h, v54.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v27.l, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v28.h, v53.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v28.l, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v28.h, v52.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v29.h, v51.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v29.l, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v30.l, v50.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v49.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2
; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v151.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v146.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v147.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v149.l, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v134.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v148.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v145.h, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v131.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v135.h, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v135.l, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.l, v32.l, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v134.l, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v118.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v133.l, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v132.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v132.l, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v117.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v114.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.l, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v112.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v128.l, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v119.h, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v102.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v102.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v117.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v100.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v116.l, v10.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v100.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v10.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v115.h, v11.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v97.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v114.h, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v87.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v96.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v103.h, v13.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v85.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v103.l, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v101.h, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v83.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v84.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v14.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v99.h, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v99.l, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v98.l, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v71.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v97.l, v16.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v71.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v16.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v16.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v87.l, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v86.h, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v67.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v85.l, v18.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v18.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v18.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v84.h, v19.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.h, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v82.h, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v51.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v82.l, v20.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v52.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v20.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v20.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v80.h, v21.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v80.l, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v70.h, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v39.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.l, v22.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v22.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v22.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v68.h, v23.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v68.l, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v37.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v66.h, v24.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v64.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v26, v31
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v65.l, v25.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v25.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v27, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v55.l, v26.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v55.h, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v26.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v33.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v28.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.l, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v50.l, v30.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.l, 0x300, v30.h
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v30.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v49.h, v30.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v32.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -111800,64 +111416,64 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
@@ -111885,50 +111501,50 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
@@ -111972,50 +111588,50 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1
; GFX11-TRUE16-NEXT: .LBB72_4: ; %end
@@ -112023,307 +111639,266 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v55, v39
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v135.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v53, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v131.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v53, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l
; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v53
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v53
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v39
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v118.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v51, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v52
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v51, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v16.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v52
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v50, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v114.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v50, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l
; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v49, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v50
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v50
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v49, v39
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v101.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v48, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v49
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v48, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v22.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v49
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v97.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v39
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v25.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v84.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v27.l, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v28.l, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v80.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v71.l
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v69.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v30.l, v30.h
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v39
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v31.l, v31.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v68.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v39
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v32.l, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v39
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32
; GFX11-TRUE16-NEXT: s_clause 0x5
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
@@ -121839,63 +121414,63 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:368
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:364
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:348
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:324
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:316
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:288
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:276
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:272
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136
@@ -121909,144 +121484,143 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v29.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v82.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v84.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v85.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v86.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v99.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v52.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v49.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -122060,746 +121634,660 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB74_3: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v151.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v150.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v146.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v151.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v150.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v147.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v0.h, v149.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v149.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v144.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v133.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v148.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v144.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v145.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v1.h, v147.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v131.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v2.l, v146.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v131.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v135.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v3.l, v145.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v128.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v118.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v116.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v4.l, v135.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v134.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v5.l, v133.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v132.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v114.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v6.l, v132.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v130.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v7.l, v130.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v129.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v102.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v8.l, v128.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v119.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v9.l, v118.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v102.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v117.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v97.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v10.l, v116.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v100.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v115.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v11.l, v114.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v11.h, v113.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v85.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v12.l, v113.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v103.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v13.l, v103.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v101.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v81.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v14.l, v101.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v99.h
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v71.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v15.l, v99.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v15.h, v98.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v16.l, v97.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v16.h, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v17.l, v87.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v17.h, v86.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v64.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v18.l, v85.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v18.h, v84.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v51.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v19.l, v83.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v19.h, v82.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v20.l, v82.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v20.h, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v21.l, v80.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v21.h, v70.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v22.l, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v22.h, v68.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v23.l, v68.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v23.h, v66.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v24.l, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v24.h, v65.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v25.l, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v25.h, v55.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v26.l, v55.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v26.h, v54.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v27.l, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v28.h, v53.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v28.l, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v28.h, v52.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v29.h, v51.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v29.l, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v30.l, v50.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v49.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB74_2
; GFX11-TRUE16-NEXT: .LBB74_4: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v151.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v146.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v147.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v149.l, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v134.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v148.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v145.h, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v131.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v135.h, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v135.l, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.l, v32.l, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v134.l, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v118.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v133.l, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v132.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v132.l, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v117.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v114.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.l, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v112.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v128.l, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v119.h, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v102.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v102.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v117.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v100.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v116.l, v10.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v100.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v10.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v115.h, v11.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v97.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v114.h, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v87.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v96.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v103.h, v13.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v85.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v103.l, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v101.h, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v83.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v84.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v14.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v99.h, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v99.l, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v98.l, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v71.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v97.l, v16.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v71.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v16.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v16.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v87.l, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v86.h, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v67.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v85.l, v18.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v18.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v18.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v84.h, v19.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.h, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v82.h, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v51.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v82.l, v20.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v52.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v20.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v20.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v80.h, v21.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v80.l, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v70.h, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v39.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.l, v22.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v22.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v22.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v68.h, v23.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v68.l, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v37.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v66.h, v24.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v64.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v26, v31
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v65.l, v25.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v25.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v27, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v55.l, v26.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v55.h, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v26.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v33.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v28.h
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.l, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v50.l, v30.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.l, 0x300, v30.h
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v30.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v49.h, v30.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v32.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h
+; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -160089,159 +159577,162 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:32
-; GFX11-TRUE16-NEXT: s_clause 0x4
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:44
+; GFX11-TRUE16-NEXT: s_clause 0x7
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:12
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr111_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr107_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr75_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr61_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr107_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr109_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr75_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr61_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
@@ -160250,143 +159741,142 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v104, 8, v3
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 24, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.h, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v8.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 24, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18]
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.h, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v8.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v8.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.h, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v104.h, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.h, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v95.h, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.h, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v24.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.h, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v26.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.h, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.h, v30.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v31.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v32.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v10.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.h, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v108.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.h, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v110.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v107.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v109.h, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v31.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.h, v32.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v32.h
; GFX11-TRUE16-NEXT: .LBB90_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v18
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v20
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v18, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v18
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v37, v37, v18, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v70, v37, v39 :: v_dual_add_f32 v33, 0x40c00000, v33
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v80, v37, v39 :: v_dual_add_f32 v33, 0x40c00000, v33
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v33, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v33, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v17
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v70.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v80.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v36, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v36, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_lshlrev_b32 v17, 16, v17
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
@@ -160399,498 +159889,500 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add3_u32 v18, v48, v34, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v37, v50, v17, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v71, v37, v51 :: v_dual_lshlrev_b32 v20, 16, v20
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v82, v37, v51 :: v_dual_and_b32 v35, 0xffff0000, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v20, 16, v20
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v11
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v71.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v82.h
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v20
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v18, v49, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v33, v55
+; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v33, v81
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v20
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35
; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v34, v17
; GFX11-TRUE16-NEXT: v_add3_u32 v34, v36, v35, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v18
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v19, 0x40c00000, v19
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v18
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_lshlrev_b32 v19, 16, v19
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v22
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v19, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_lshlrev_b32 v22, 16, v22
; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v36, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v83.h
+; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v19, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v19, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v81.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v17
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v17
+; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v19, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_cndmask_b32 v85, v33, v37
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v22, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v84.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v85.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v20, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v35, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v20, v33, v22, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v22
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v86, v20, v33 :: v_dual_add_f32 v35, 0x40c00000, v35
+; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v34, v84
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v86.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v35, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
-; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v35, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v20, v33, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v34, v80
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v19, v39, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v37, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v83.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v20
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
-; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v82
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 24, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 8, v20
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v35, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v21
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v22
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v19, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v37, v36
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24
; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v38 :: v_dual_lshlrev_b32 v24, 16, v24
+; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v87
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v21, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v21, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v86, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v23
+; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v22
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v96, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v23
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v24, 16, 1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v24, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v22
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v38, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v24
+; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v86.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v34, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 0x40c00000, v23 :: v_dual_lshlrev_b32 v26, 16, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v96.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v19
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v34, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v35, v21
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26
; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v23, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v33, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v98, v33, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v37, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v21
; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v23, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v34, v36, vcc_lo
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_f32 v34, 0x40c00000, v38
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v97.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v37, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v37, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
+; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v35, v21
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v99, v34, v36, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v38
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v99.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v24, v39, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v26
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v36, v23
; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v25
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v87.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v23
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v97.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v26
; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v33, v85
+; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v33, v98
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v26, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 24, v24
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v24
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v26, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v36, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v24
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v98, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 8, v24
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v100, v33, v37, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-TRUE16-NEXT: v_add3_u32 v26, v26, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v96, v35, v38 :: v_dual_add_f32 v25, 0x40c00000, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v98.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v25, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v28, 0x40c00000, v28
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v26, v26, v36, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v101, v35, v38, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v28
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v100.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v102, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v28, 16, v28
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v102.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v35, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v101, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v26, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v27
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v28, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
; GFX11-TRUE16-NEXT: v_add3_u32 v25, v25, v35, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v26, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
; GFX11-TRUE16-NEXT: v_add3_u32 v26, v33, v28, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v28
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v27
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v101.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v100, v26, v33, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v103, v26, v33, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
-; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v34, v96
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v100.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v99, v25, v39, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v37, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v34, v101
; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v27, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v103.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v112, v25, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v37, v36
+; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v27, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v27, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
+; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v112
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v113, v34, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v29
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v112, v34, v37, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1
-; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v99
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v26
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v37, 0x40c00000, v37
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_lshlrev_b32 v29, 16, v29
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v29
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v30, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v30
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v112.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v28
-; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v30, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v25
-; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v35, v27
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v103, v34, v38 :: v_dual_and_b32 v38, 0xffff0000, v32
+; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v30, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v37, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v113.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v28
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v114, v34, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v32
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v27
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v29, 16, 1
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v102, v33, v39 :: v_dual_add_f32 v37, 0x40c00000, v37
; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v115, v33, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-TRUE16-NEXT: v_add3_u32 v30, v30, v37, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v29, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v103.h
-; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v37, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v113, v34, v36, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v114.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v35, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 24, v26
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v116, v34, v36, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-TRUE16-NEXT: v_add3_u32 v30, v30, v37, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v38
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v113.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 8, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v116.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v30, v39, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1
-; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v33, v102
-; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v32, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v33, v115
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v25
; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v36, v29
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v31
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v31, 16, v31
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v30
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v32, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v32
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v29
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v32, 0x7fff
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v31
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v30
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v115, v33, v37, vcc_lo
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_lshlrev_b32 v31, 16, v31
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v117, v33, v37, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v29
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v115.h
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
-; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v114, v35, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v31, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v36, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v117.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v116, v33, v37 :: v_dual_and_b32 v35, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v118, v35, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v119, v33, v37, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v116.h
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v35, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
+; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v36, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v119.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v31, v31, v35, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v32, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v2, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v31, v31, v35, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v32, v33, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v133, v32, v33, vcc_lo
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v128, v32, v33, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT: v_bfi_b32 v32, 0xffff, v34, v114
+; GFX11-TRUE16-NEXT: v_bfi_b32 v32, 0xffff, v34, v118
; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v132, v31, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v128.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v129, v31, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v37, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v133.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 24, v32
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v146, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v131, v34, v37, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v132
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v3
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v35, v38 :: v_dual_add_f32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v35, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v37, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v148, v34, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v131.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v129
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v133, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v35, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v2
+; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v135, v33, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v37, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v144, v33, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v32
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v146, v34, v36 :: v_dual_add_f32 v37, 0x40c00000, v37
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v31
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v146.h
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v37, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v148.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v146.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v34, v36, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v38
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v35, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v164.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v39, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v37, 0x7fff
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v4, v39 :: v_dual_add_f32 v34, 0x40c00000, v38
; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v7
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v36, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v133.h
+; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v33, v144
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v7, 0x40c00000, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v3
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v33, v135
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v6, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v6
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v32
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v36, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v31
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v104, 8, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v150, v33, v37, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v5, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v36, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v165.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v161, v35, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v151, v35, v38, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v180, v33, v37 :: v_dual_add_f32 v35, 0x40c00000, v35
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v150.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v166, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v8, 16, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v166.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v180.h
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v35, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v8, 16, 1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v6, v38, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v35, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v33, v8, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v6, v33, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v177, v6, v33, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v34, v161
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v179.h
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v178, v5, v38 :: v_dual_add_f32 v33, 0x40c00000, v39
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v39
+; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v34, v151
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v178, v5, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v37, v36
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v10
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_lshlrev_b32 v36, 16, v10
; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v33, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v177.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33
; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v33, 0x7fff
; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v178
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v47, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v36, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 24, v8
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v43, v35, v37, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v8
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v36, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 8, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v47.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v44, v7, v37, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v39
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v9, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v9
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v9, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v43.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v41, v7, v37, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v10, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v10
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v10, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v41, v35, v38 :: v_dual_lshlrev_b32 v10, 16, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v44, v35, v38 :: v_dual_and_b32 v39, 0xffff0000, v9
; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v44.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v41.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v37, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v37
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v35, v41
-; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v35, v44
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v37, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v51
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v10
-; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v61, v38, v50 :: v_dual_add_f32 v12, 0x40c00000, v12
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v59, v38, v50, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v12, 0x40c00000, v12 :: v_dual_lshlrev_b32 v7, 16, v9
; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v61.h
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v51 :: v_dual_lshlrev_b32 v14, 16, v14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v12, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT: v_add3_u32 v48, v48, v12, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v59.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v73, v35, v49, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v39
; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v48, v48, v12, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_lshlrev_b32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v57, v48, v52, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v14, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v57
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v36, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v62, v48, v52, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v62
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v9, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v11
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v35, v37, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v37
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 24, v12
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v36, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v73.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v12
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v35, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v37, v39, v7, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v13
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v35, 16, 1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v36, v9
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v39
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v89, v37, v38, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v39 :: v_dual_cndmask_b32 v92, v37, v38
; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v35, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
@@ -160898,18 +160390,18 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v14
; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v7, 16, 1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v77, v37, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v89, v37, v38, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v7
; GFX11-TRUE16-NEXT: v_add3_u32 v14, v49, v7, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v16
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v78, v39, v48, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v91, v39, v48, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v78.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v73.h
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v14, v35 :: v_dual_add_f32 v14, 0x40c00000, v37
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v15
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v13, 0x7fff
@@ -160919,7 +160411,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v104, v35, v39, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v108, v35, v39, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v16, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v37, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
@@ -160927,405 +160419,366 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v37
; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v37, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v93, v13, v49, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v107, v13, v49, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
; GFX11-TRUE16-NEXT: v_add3_u32 v35, v48, v14, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v14
; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v15, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v95, v39, v51, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v110, v39, v51, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v104.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v108.h
; GFX11-TRUE16-NEXT: v_add3_u32 v13, v50, v15, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v89.h
-; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v38, v77
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v92, v35, v48, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v91.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v92.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v109, v35, v48, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v95.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v93.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v110.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v107.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v38, v89
; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v39, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v14
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v14
-; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v35, v92
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18]
+; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v36, v9
+; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v35, v109
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12]
; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v15, v13
; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v37, v7
; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v34, v33
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v7
; GFX11-TRUE16-NEXT: .LBB90_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v108.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v131.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v111.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v133.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v107.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v68.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.h, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v129.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v128.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v6.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v106.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v164.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v105.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v94.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v91.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v148.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v180.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v90.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v144.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v4.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v88.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v5.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v47.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v76.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v58.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v75.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v161.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v179.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v72.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v6.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v105.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v146.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v104.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v95.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v135.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v133.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v93.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v5.l, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v166.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v88.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v5.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v78.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v150.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v151.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v76.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v6, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v43.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v7.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v74.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v73.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v178.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v59.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v56.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v8.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v44.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v43.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v89.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v41.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v42.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v10.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v61.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v183.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v16, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v11.l, v12.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v104.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v176.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v166.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v16, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v167.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v57.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v78.h
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v12.l, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v15.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v16, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v160.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v77.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v95.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v150.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, v18, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v93.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v149.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, v18, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v79.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v92.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v134.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v18, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v74.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v18, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v46.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v63.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v15.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v62.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v81.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v60.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v20, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v45.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v19, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v83.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v40.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v19, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v38.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v182.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v82.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v181.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v13.h, v19.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v87.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v177.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v22, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v13.h, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v163.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v22, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v85.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v162.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v98.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v151.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v36.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.h, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v26, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v28
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v112.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v145.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v25, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v26
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.h, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v100.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v135.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v25, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v99.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v130.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v13.h, v25.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v30
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v129.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v62.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v6, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v177.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v63.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v178.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v60.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v180.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v68, v6, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v73.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v57.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v6, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v41.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v47.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v44.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v45.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v6, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v92.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v40.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v6, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v59.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v182.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v6, v9
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v89.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v108.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v176.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v6, v10
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v163.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v91.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v165.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v6, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v110.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v109.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v6, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v147.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v107.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v149.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v6, v13
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v94.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v6, v14
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v79.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v90.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v6, v15
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v85.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v77.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v6, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v72.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.l, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v83.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v75.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v6, v17
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v61.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v87.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v6, v18
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v56.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v58.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v6, v19
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v99.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v46.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v6, v20
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v183.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v97.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v42.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v6, v21
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v102.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v181.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v6, v22
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v167.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v179.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v6, v23
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v113.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v112.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v6, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v160.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.l, v25.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v103.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v162.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.l, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v6, v25
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v116.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v148.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v28, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v30
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v13.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v116.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v128.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v102.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.l, 8, v119.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v115.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v115.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v6, v26
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.l, v27.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v114.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v145.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.l, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v6, v27
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v13.h, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v32, v14
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v34
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v114.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v119.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v134.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v118.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v6, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v130.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.l, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v117.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v132.l
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[66:69], off offset:16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v6, v29
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.l, v30.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v14
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v6, v5
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[64:67], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:48
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[15:18], off offset:64
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[19:22], off offset:80
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[23:26], off offset:96
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[27:30], off offset:112
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:136
-; GFX11-TRUE16-NEXT: s_clause 0x4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:136
+; GFX11-TRUE16-NEXT: s_clause 0x7
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:168
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -185302,69 +184755,69 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
@@ -185375,69 +184828,69 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17
; GFX11-TRUE16-NEXT: .LBB94_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_4
@@ -185446,405 +184899,364 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17
; GFX11-TRUE16-NEXT: .LBB94_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v166.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v165.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v164.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v69.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v1.h, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v68
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v54, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v68, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v67
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v54, v51
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v67
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v67.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v54, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v54, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v131.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v65
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v65
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v54, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v116.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v54, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l
; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v51
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v97.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v49, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v52
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v49, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v16.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v52
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v48, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v145.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l
; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v128.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v22.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v83.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v113.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v51
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v25.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v86.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v27.l, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v28.l, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v81.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v30.l, v30.h
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v51
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v31.l, v31.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v55.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v32.l, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v51
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32
; GFX11-TRUE16-NEXT: s_clause 0x5
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
@@ -208055,69 +207467,69 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
@@ -208128,69 +207540,69 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17
; GFX11-TRUE16-NEXT: .LBB98_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_4
@@ -208199,405 +207611,364 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17
; GFX11-TRUE16-NEXT: .LBB98_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v166.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v165.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v164.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v69.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v1.h, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v68
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v54, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v68, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v67
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v54, v51
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v67
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v67.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v54, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v54, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v131.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v65
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v65
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v54, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v116.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v54, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v64
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l
; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l
; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v54
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v51
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v97.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v49, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v52
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v49, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v16.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v52
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v48, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v145.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l
-; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l
; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v128.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v22.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v83.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v113.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v39
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v81.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v51
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v25.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v86.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v27.l, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v28.l, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v81.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v51
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v30.l, v30.h
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v51
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v31.l, v31.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v55.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v51
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v32.l, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v51
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31
+; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32
; GFX11-TRUE16-NEXT: s_clause 0x5
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
index 3e96ab1..21ec3ee1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
@@ -4118,19 +4118,19 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -4144,103 +4144,95 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -8592,19 +8584,19 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -8618,103 +8610,95 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -12682,19 +12666,19 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -12708,103 +12692,95 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -16382,19 +16358,19 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -16408,103 +16384,95 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2
; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -19811,19 +19779,19 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -19837,103 +19805,95 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2
; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -22725,19 +22685,19 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -22751,103 +22711,95 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2
; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -24944,19 +24896,19 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -24970,103 +24922,95 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2
; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index f8ffaa4..38302a7 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -6296,32 +6296,31 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v22.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3
@@ -6333,194 +6332,175 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v0.h, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v1.h, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v2.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v11.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v3.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v4.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v5.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v9.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v6.l, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v7.l, v8.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v19
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v15.h, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v16.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v14.h, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v13.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.l, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v12.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v18.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.h, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v10.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v21
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v11.l, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v21
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v9.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v6.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v21
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v21
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -13335,32 +13315,31 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v22.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3
@@ -13372,194 +13351,175 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v0.h, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v1.h, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v2.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v11.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v3.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v4.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v5.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v9.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v6.l, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v7.l, v8.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v19
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v15.h, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v16.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v14.h, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v13.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.l, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v12.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v18.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.h, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v10.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v21
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v11.l, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v21
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v9.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v6.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v21
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v21
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -19892,32 +19852,31 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v22.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3
@@ -19929,194 +19888,175 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v0.h, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v1.h, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v2.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v11.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v3.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v4.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v5.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v9.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v6.l, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v7.l, v8.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v19
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v15.h, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v16.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v14.h, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v13.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.l, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v12.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v18.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.h, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v10.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v21
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v11.l, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v21
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v9.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v6.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v21
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v21
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -25939,32 +25879,31 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v22.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3
@@ -25976,194 +25915,175 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v0.h, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v1.h, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v2.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v11.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v3.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v4.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v5.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v9.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v6.l, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v7.l, v8.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v19
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2
; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v15.h, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v16.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v14.h, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v13.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.l, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v12.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v18.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.h, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v10.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v21
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v11.l, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v21
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v9.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v6.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v21
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v21
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index 0cefbc1..436b1a0 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -2966,20 +2966,20 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
@@ -2995,17 +2995,17 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB12_2: ; %Flow
@@ -3029,17 +3029,17 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB12_4: ; %end
@@ -3047,105 +3047,93 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v10i32_to_v40i8:
@@ -5038,48 +5026,49 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v29.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v35.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_3
@@ -5092,245 +5081,217 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v0.h, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v19.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v1.h, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v2.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v15.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v3.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v0.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v27, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v1.h, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v27, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v2.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v3.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v27, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v4.l, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v5.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v27, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v6.l, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v27.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v4.l, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v14.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v25
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v5.l, v13.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v13.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v6.l, v12.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v11.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v7.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v8.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v25
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v9.l, v10.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v27, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v7.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v27.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v27, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v8.l, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v27.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v27, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v9.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v27.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v25
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v27, v9
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v22.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v23.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v22.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.h, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v18.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v20.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v21.l, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v25.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v19.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v16.h, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v20.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v21.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v16.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v17.h, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v19.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.h, v3.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v27
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.l, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v18.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v25.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v18.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v25, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v15.h, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v14.h, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v15.l, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.h, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v25, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v14.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v13.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v13.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v13.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v12.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v27
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v12.h, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v27
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v25, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v11.l, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v11.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v10.h, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v8.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v27
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v27
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v25, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.h, v8.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v25, v9
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -9951,20 +9912,20 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
@@ -9980,17 +9941,17 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB32_2: ; %Flow
@@ -10010,17 +9971,17 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB32_4: ; %end
@@ -10028,105 +9989,93 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v10f32_to_v40i8:
@@ -12037,48 +11986,49 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v29.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v35.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_3
@@ -12091,245 +12041,217 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB34_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v0.h, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v19.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v1.h, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v2.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v15.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v3.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v0.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v23.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v27, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v1.h, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v27, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v2.l, v20.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v3.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v27, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v4.l, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v5.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v27, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v6.l, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v27.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v4.l, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v14.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v25
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v5.l, v13.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v13.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v6.l, v12.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v11.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v7.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v25
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v8.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v25
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v9.l, v10.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v27, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v7.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v27.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v27, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v8.l, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v27.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v27, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v9.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v27.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v25
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v27, v9
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2
; GFX11-TRUE16-NEXT: .LBB34_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v22.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v23.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v22.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.h, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v18.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v20.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v21.l, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v25.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v19.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v16.h, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v20.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v21.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v19.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v16.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v17.h, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v19.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.h, v3.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v27
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.l, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v18.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v25.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v18.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v25, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v15.h, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v14.h, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v15.l, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.h, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v25, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v14.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v13.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v13.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v13.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v12.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v27
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v12.h, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v27
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v25, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v11.l, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v11.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v10.h, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v27
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v8.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v27
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v27
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v25, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.h, v8.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v25, v9
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -16358,20 +16280,20 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
@@ -16387,17 +16309,17 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow
@@ -16421,17 +16343,17 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB48_4: ; %end
@@ -16439,105 +16361,93 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v20i16_to_v40i8:
@@ -22479,20 +22389,20 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
@@ -22508,17 +22418,17 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB60_2: ; %Flow
@@ -22542,17 +22452,17 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB60_4: ; %end
@@ -22560,105 +22470,93 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v20f16_to_v40i8:
@@ -28859,50 +28757,51 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v38.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v38.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB72_3
@@ -28915,245 +28814,216 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB72_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v0.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v27.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v1.h, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v2.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v3.l, v23.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v0.l, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v1.h, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v2.l, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v3.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v22.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v5.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v6.l, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v21.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v5.l, v19.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v19.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v6.l, v18.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v18.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v7.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v11, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v8.l, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v16.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v9.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.l, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v9.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_2
; GFX11-TRUE16-NEXT: .LBB72_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v30.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v30.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.h, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v26.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.l, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v28.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v29.l, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v34.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v27.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v25.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v24.h, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v28.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v29.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v27.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v25.h, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v27.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v23.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v23.h, v3.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.l, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v26.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v21.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v23.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v23.h, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v21.h, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v22.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.h, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v22.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v19.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v19.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v19.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v18.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v11
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v18.h, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v17.l, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v17.h, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v17.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v16.h, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v11
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v16.h, v8.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -30908,20 +30778,20 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
@@ -30937,17 +30807,17 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB74_2: ; %Flow
@@ -30966,17 +30836,17 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB74_4: ; %end
@@ -30984,105 +30854,93 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v5f64_to_v40i8:
@@ -33010,50 +32868,51 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:24
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v38.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v38.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB76_3
@@ -33066,245 +32925,216 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB76_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v0.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v27.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v1.h, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v2.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v3.l, v23.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v0.l, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v34.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v1.h, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v2.l, v28.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v3.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v26.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v22.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v5.l, v21.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v6.l, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v21.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v5.l, v19.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v19.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v6.l, v18.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v18.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v7.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v11, v10
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v8.l, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v16.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v9.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.l, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v9.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v10
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB76_2
; GFX11-TRUE16-NEXT: .LBB76_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v30.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v30.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.h, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v26.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.l, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v28.h, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v29.l, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v34.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v27.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v25.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v24.h, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v28.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v29.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v27.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v25.h, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v27.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v23.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v23.h, v3.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.l, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v26.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v21.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v23.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v23.h, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v21.h, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v22.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.h, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v22.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v19.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v19.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v19.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v18.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v11
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v18.h, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v17.l, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v17.h, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v17.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v16.h, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v11
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v11
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v16.h, v8.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -35074,20 +34904,20 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
@@ -35103,17 +34933,17 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB78_2: ; %Flow
@@ -35140,17 +34970,17 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-TRUE16-NEXT: .LBB78_4: ; %end
@@ -35158,105 +34988,93 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v5i64_to_v40i8:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
index 48c9b87..8e30ee6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
@@ -2257,8 +2257,8 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
@@ -2273,19 +2273,17 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true
@@ -2295,16 +2293,14 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -4506,8 +4502,8 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
@@ -4522,19 +4518,17 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB42_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2
; GFX11-TRUE16-NEXT: .LBB42_4: ; %cmp.true
@@ -4544,16 +4538,14 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -6467,8 +6459,8 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
@@ -6483,19 +6475,17 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2
; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true
@@ -6505,16 +6495,14 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -8116,8 +8104,8 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
@@ -8132,19 +8120,17 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
@@ -8154,16 +8140,14 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -9479,8 +9463,8 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
@@ -9495,19 +9479,17 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB78_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB78_2
; GFX11-TRUE16-NEXT: .LBB78_4: ; %cmp.true
@@ -9517,16 +9499,14 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -10193,8 +10173,8 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4
@@ -10209,19 +10189,17 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB82_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2
; GFX11-TRUE16-NEXT: .LBB82_4: ; %cmp.true
@@ -10231,16 +10209,14 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index 5aac06a..35d135b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -8768,32 +8768,32 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
@@ -8812,26 +8812,26 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB24_2: ; %Flow
@@ -8864,26 +8864,26 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB24_4: ; %end
@@ -8891,156 +8891,135 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -12470,15 +12449,15 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16
@@ -12492,84 +12471,82 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -12581,384 +12558,338 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v54.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v51.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v53.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v0.h, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v51.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v1.h, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v2.l, v49.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v3.l, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v4.l, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v5.l, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v26.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v6.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v7.l, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v8.l, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v9.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v21.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v10.l, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v11.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v12.l, v18.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v18.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v13.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v14.l, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v54
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v15.l, v16.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v54
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v51.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v50.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v52.l, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v39.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v30.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.h, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v51.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v27.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v49.l, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v27.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v29.h, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v25.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v29.l, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.h, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v26.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v38.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v25.h, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v24.l, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v23.h, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v22.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v36.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v23.l, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v21.h, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v55
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v55
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.h, v10.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v21.l, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v34.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v55
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.h, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v19.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v18.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.l, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v32.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v17.l, v13.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.h, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v14.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v55
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v55
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -23588,32 +23519,32 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
@@ -23632,26 +23563,26 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow
@@ -23676,26 +23607,26 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB48_4: ; %end
@@ -23703,156 +23634,135 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -27413,15 +27323,15 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16
@@ -27435,84 +27345,82 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -27524,384 +27432,338 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v54.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v51.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v53.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v0.h, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v51.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v1.h, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v2.l, v49.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v3.l, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v4.l, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v5.l, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v26.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v6.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v7.l, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v8.l, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v9.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v21.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v10.l, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v11.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v12.l, v18.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v18.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v13.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v14.l, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v54
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v15.l, v16.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v54
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v51.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v50.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v52.l, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v39.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v30.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.h, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v51.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v27.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v49.l, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v27.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v29.h, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v25.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v29.l, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.h, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v26.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v38.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v25.h, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v24.l, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v23.h, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v22.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v36.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v23.l, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v21.h, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v55
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v55
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.h, v10.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v21.l, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v34.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v55
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.h, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v19.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v18.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.l, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v32.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v17.l, v13.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.h, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v14.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v55
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v55
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -37916,32 +37778,32 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
@@ -37960,26 +37822,26 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB68_2: ; %Flow
@@ -38017,26 +37879,26 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB68_4: ; %end
@@ -38044,156 +37906,135 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -41628,15 +41469,15 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16
@@ -41650,84 +41491,82 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -41739,384 +41578,338 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v54.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v51.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v53.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v0.h, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v51.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v1.h, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v2.l, v49.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v3.l, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v4.l, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v5.l, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v26.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v6.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v7.l, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v8.l, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v9.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v21.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v10.l, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v11.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v12.l, v18.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v18.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v13.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v14.l, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v54
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v15.l, v16.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v54
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v51.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v50.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v52.l, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v39.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v30.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.h, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v51.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v27.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v49.l, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v27.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v29.h, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v25.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v29.l, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.h, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v26.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v38.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v25.h, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v24.l, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v23.h, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v22.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v36.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v23.l, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v21.h, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v55
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v55
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.h, v10.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v21.l, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v34.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v55
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.h, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v19.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v18.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.l, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v32.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v17.l, v13.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.h, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v14.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v55
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v55
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -51295,32 +51088,32 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
@@ -51339,26 +51132,26 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB84_2: ; %Flow
@@ -51383,26 +51176,26 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB84_4: ; %end
@@ -51410,156 +51203,135 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -54989,15 +54761,15 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16
@@ -55011,84 +54783,82 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3
; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow
@@ -55100,384 +54870,338 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v54.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v51.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v53.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v53.l
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v0.h, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v51.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v1.h, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v2.l, v49.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v3.l, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v4.l, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v5.l, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v26.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v6.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v7.l, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v8.l, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v9.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v21.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v10.l, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v11.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v12.l, v18.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v18.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v54
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v13.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v54
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v14.l, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v54
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v15.l, v16.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v54
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2
; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v51.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v50.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v52.l, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v39.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v30.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.h, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v51.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v48.l, v3.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v27.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v49.l, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v27.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v29.h, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v25.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v4.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v29.l, v5.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.h, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v26.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v38.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v25.h, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v6.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v24.l, v7.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v23.h, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v22.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v36.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v23.l, v8.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v21.h, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v55
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v55
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.h, v10.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v21.l, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v34.h, 3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v55
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.h, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v19.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v18.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v55
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.l, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v12.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v32.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v55
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v17.l, v13.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.h, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v55
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v14.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v17
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v55
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v55
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -64573,32 +64297,32 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
@@ -64617,26 +64341,26 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB96_2: ; %Flow
@@ -64669,26 +64393,26 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB96_4: ; %end
@@ -64696,156 +64420,135 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -76701,32 +76404,32 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
@@ -76745,26 +76448,26 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB104_2: ; %Flow
@@ -76797,26 +76500,26 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1
; GFX11-TRUE16-NEXT: .LBB104_4: ; %end
@@ -76824,156 +76527,135 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -85692,59 +85374,59 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -85757,307 +85439,302 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v7.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.h, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v8.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v10.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v16.h
; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v4
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v17, 0x40c00000, v17
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v11, 16, v11
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v2
; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v17, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v2, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v17, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v20, v22, vcc_lo
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_f32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v27.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v17, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v20, v22, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v21, v23, vcc_lo
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18
+; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v26.h
; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v21, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff
-; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v26
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v27
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v20, v21, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v28.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v17, v22, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v3
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v19, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v18, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v20, v1
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v18, v22, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_lshlrev_b32 v6, 16, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v19, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_lshlrev_b32 v6, 16, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v17, v23, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v21, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v29, v17, v23
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21
; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v3
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v21, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v29.h
; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v30.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v18, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v18, v19, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v20, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v32.h
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v4, v23 :: v_dual_add_f32 v18, 0x40c00000, v22
-; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v17, v29
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v23, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v18, 16, 1
+; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v17, v30
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v6, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v19, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v18, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18
+; GFX11-TRUE16-NEXT: v_add3_u32 v19, v21, v18, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v20, 16, 1
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v4
-; GFX11-TRUE16-NEXT: v_add3_u32 v19, v21, v18, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v17, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v17, v21, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v19, v22 :: v_dual_and_b32 v20, 0xffff0000, v5
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v20, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v19, v22, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v8, 16, v8
-; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v20, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v20
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v5, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v20, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v20
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v17, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v34, v17, v21 :: v_dual_add_f32 v19, 0x40c00000, v19
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v8, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v6, v22, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v34.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v19, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v6, v22, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v17, v8, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v19, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v6, v17, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v7
-; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v18, v31
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v5, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v18, v33
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v5, v22, vcc_lo
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v21, v20
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v35.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_add_f32 v7, 0x40c00000, v7
-; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v34
+; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v36
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v7, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v7
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v8
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v7, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v20, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v49, v19, v21, vcc_lo
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v23 :: v_dual_add_f32 v10, 0x40c00000, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v19, v21, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v20, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v20
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v23 :: v_dual_add_f32 v10, 0x40c00000, v10
; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v17, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v10, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v10, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v18, v22, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v7, v21, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v38.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v7, v21, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v48, v19, v22 :: v_dual_lshlrev_b32 v7, 16, v9
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v19, v22, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v37.h
; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v21, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v21
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v19, v48
+; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v19, v39
; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v12, 16, 1
; GFX11-TRUE16-NEXT: v_add3_u32 v22, v22, v21, 0x7fff
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v9
; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v10
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v49
; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v12, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v54, v22, v37, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v52, v22, v48 :: v_dual_add_f32 v9, 0x40c00000, v23
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v14
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_lshlrev_b32 v14, 16, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v10
+; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v9, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v7, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v7
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v14
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v9, 0x7fff
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v10
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v7, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v54.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_lshlrev_b32 v11, 16, v11
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v65, v19, v25, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v19, v25, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v9
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v38
-; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v14, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v52, v24, v50 :: v_dual_add_f32 v9, 0x40c00000, v23
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v21, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v52
-; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v9, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v14, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v53, v24, v50, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v53
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v20, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v11
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v19, v21, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v9, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 8, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v20, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v55.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v65.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v12
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v19, vcc_lo
; GFX11-TRUE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v22
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v21, v23, v7, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v13
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v19, 16, 1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v23
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v21, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v20, v9
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v23 :: v_dual_cndmask_b32 v70, v21, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v21, v24, v19, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX11-TRUE16-NEXT: v_add3_u32 v23, v25, v14, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v14
; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 0x40c00000, v13 :: v_dual_cndmask_b32 v66, v21, v22
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v21, v22, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v14, v25, v7, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v16
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v23, v24, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v69, v23, v24, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v13, 16, 1
-; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v20, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v9
; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v68.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v69.h
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v14, v19 :: v_dual_add_f32 v14, 0x40c00000, v21
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v15
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v23, v13, 0x7fff
@@ -86067,42 +85744,42 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v16
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v19, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v16, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v21, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v14, 16, 1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v21
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v21
; GFX11-TRUE16-NEXT: v_add3_u32 v23, v23, v21, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v13, v25, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v13, v25, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v24, v14, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v14
-; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v15, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v15, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v23, v38, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v23, v49, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v86.h
-; GFX11-TRUE16-NEXT: v_add3_u32 v13, v37, v15, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v71.h
-; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v22, v66
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v19, v24, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v85.h
+; GFX11-TRUE16-NEXT: v_add3_u32 v13, v48, v15, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v70.h
+; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v22, v67
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v19, v24, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v82.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v85.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v83.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v87.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v23, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 24, v14
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo
-; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v19, v81
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v9
+; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v19, v86
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 8, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v15, v13
; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v21, v7
; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v18, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
@@ -86111,159 +85788,142 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v15
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v7
; GFX11-TRUE16-NEXT: .LBB108_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v28.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v112.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.h, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v102.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.l, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v103.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.h, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v101.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v99.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v100.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v24
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v5.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v6.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v97.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v96.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v12
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v97.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v87.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v84.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v69.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v83.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v7.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v84.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v81.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v71.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v70.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v24
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v67.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v54.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v64.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v71.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v66.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v53.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v13.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v16, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v19
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v53.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v16, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v14.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v85.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v51.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v13.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v85.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v64.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v15.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v87.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v50.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v66.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v82.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v83.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v17.l, v17.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
index 6fe6665..4c48576 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
@@ -3065,13 +3065,12 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
@@ -3085,61 +3084,53 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -6214,13 +6205,12 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
@@ -6234,61 +6224,53 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -9063,13 +9045,12 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
@@ -9083,61 +9064,53 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -11603,13 +11576,12 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
@@ -11623,61 +11595,53 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2
; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -13829,13 +13793,12 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
@@ -13849,61 +13812,53 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2
; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -15655,13 +15610,12 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
@@ -15675,61 +15629,53 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2
; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -16966,13 +16912,12 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8
@@ -16986,61 +16931,53 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2
; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
index e5245f7..879e852 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
@@ -1102,15 +1102,16 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v11.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
@@ -1125,80 +1126,74 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB6_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v0.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v3.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v3.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v2.l, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2
; GFX11-TRUE16-NEXT: .LBB6_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v7.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.l, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4241,15 +4236,16 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v11.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
@@ -4264,80 +4260,74 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v0.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v3.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v3.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v2.l, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v7
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v7.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.l, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v5
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -6885,16 +6875,16 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v9.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
@@ -6909,80 +6899,74 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB36_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v5.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v4.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_2
; GFX11-TRUE16-NEXT: .LBB36_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -8651,16 +8635,16 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v9.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
@@ -8675,80 +8659,74 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB40_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v5.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v4.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2
; GFX11-TRUE16-NEXT: .LBB40_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -10065,16 +10043,16 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v9.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
@@ -10089,80 +10067,74 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB44_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v5.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v4.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2
; GFX11-TRUE16-NEXT: .LBB44_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v2.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v9
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
index 181dab8..7e9cb7a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -105,7 +105,7 @@ declare void @unknown()
define amdgpu_kernel void @kernel_calls_extern() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern(
-; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: call void @unknown()
; CHECK-NEXT: ret void
;
@@ -115,8 +115,8 @@ define amdgpu_kernel void @kernel_calls_extern() {
define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite(
-; CHECK-SAME: ) #[[ATTR3]] {
-; CHECK-NEXT: call void @unknown() #[[ATTR7:[0-9]+]]
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK-NEXT: call void @unknown() #[[ATTR6:[0-9]+]]
; CHECK-NEXT: ret void
;
call void @unknown() #0
@@ -125,7 +125,7 @@ define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect(
-; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] {
; CHECK-NEXT: call void [[INDIRECT]]()
; CHECK-NEXT: ret void
;
@@ -135,8 +135,8 @@ define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(
-; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR3]] {
-; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR7]]
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR6]]
; CHECK-NEXT: ret void
;
call void %indirect() #0
@@ -254,12 +254,11 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
attributes #0 = { "amdgpu-agpr-alloc"="0" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2:[0-9]+]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR7]] = { "amdgpu-agpr-alloc"="0" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-branch-weight-metadata.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-branch-weight-metadata.ll
index 1da8cd6..9666085 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-branch-weight-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-branch-weight-metadata.ll
@@ -14,9 +14,9 @@ define void @uniform_br_no_metadata(i32 noundef inreg %value, ptr addrspace(8) n
; GFX9-NEXT: s_mov_b32 s6, s19
; GFX9-NEXT: s_mov_b32 s5, s18
; GFX9-NEXT: s_mov_b32 s4, s17
-; GFX9-NEXT: v_mov_b32_e32 v0, s16
-; GFX9-NEXT: v_mov_b32_e32 v1, s21
-; GFX9-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX9-NEXT: v_mov_b32_e32 v1, s16
+; GFX9-NEXT: v_mov_b32_e32 v0, s21
+; GFX9-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen
; GFX9-NEXT: .LBB0_2: ; %if.end
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -61,9 +61,9 @@ define void @uniform_br_same_weight(i32 noundef inreg %value, ptr addrspace(8) n
; GFX9-NEXT: s_mov_b32 s6, s19
; GFX9-NEXT: s_mov_b32 s5, s18
; GFX9-NEXT: s_mov_b32 s4, s17
-; GFX9-NEXT: v_mov_b32_e32 v0, s16
-; GFX9-NEXT: v_mov_b32_e32 v1, s21
-; GFX9-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX9-NEXT: v_mov_b32_e32 v1, s16
+; GFX9-NEXT: v_mov_b32_e32 v0, s21
+; GFX9-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen
; GFX9-NEXT: .LBB1_2: ; %if.end
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -108,9 +108,9 @@ define void @uniform_br_then_likely(i32 noundef inreg %value, ptr addrspace(8) n
; GFX9-NEXT: s_mov_b32 s6, s19
; GFX9-NEXT: s_mov_b32 s5, s18
; GFX9-NEXT: s_mov_b32 s4, s17
-; GFX9-NEXT: v_mov_b32_e32 v0, s16
-; GFX9-NEXT: v_mov_b32_e32 v1, s21
-; GFX9-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX9-NEXT: v_mov_b32_e32 v1, s16
+; GFX9-NEXT: v_mov_b32_e32 v0, s21
+; GFX9-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen
; GFX9-NEXT: .LBB2_2: ; %if.end
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -156,9 +156,9 @@ define void @divergent_br_no_metadata(i32 noundef inreg %value, ptr addrspace(8)
; GFX9-NEXT: s_mov_b32 s6, s19
; GFX9-NEXT: s_mov_b32 s5, s18
; GFX9-NEXT: s_mov_b32 s4, s17
-; GFX9-NEXT: v_mov_b32_e32 v0, s16
-; GFX9-NEXT: v_mov_b32_e32 v1, s21
-; GFX9-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX9-NEXT: v_mov_b32_e32 v1, s16
+; GFX9-NEXT: v_mov_b32_e32 v0, s21
+; GFX9-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen
; GFX9-NEXT: .LBB3_2: ; %if.end
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -227,9 +227,9 @@ define void @divergent_br_same_weight(i32 noundef inreg %value, ptr addrspace(8)
; GFX9-NEXT: s_mov_b32 s6, s19
; GFX9-NEXT: s_mov_b32 s5, s18
; GFX9-NEXT: s_mov_b32 s4, s17
-; GFX9-NEXT: v_mov_b32_e32 v0, s16
-; GFX9-NEXT: v_mov_b32_e32 v1, s21
-; GFX9-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX9-NEXT: v_mov_b32_e32 v1, s16
+; GFX9-NEXT: v_mov_b32_e32 v0, s21
+; GFX9-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen
; GFX9-NEXT: .LBB4_2: ; %if.end
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -297,9 +297,9 @@ define void @divergent_br_then_likely(i32 noundef inreg %value, ptr addrspace(8)
; GFX9-NEXT: s_mov_b32 s6, s19
; GFX9-NEXT: s_mov_b32 s5, s18
; GFX9-NEXT: s_mov_b32 s4, s17
-; GFX9-NEXT: v_mov_b32_e32 v0, s16
-; GFX9-NEXT: v_mov_b32_e32 v1, s21
-; GFX9-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; GFX9-NEXT: v_mov_b32_e32 v1, s16
+; GFX9-NEXT: v_mov_b32_e32 v0, s21
+; GFX9-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen
; GFX9-NEXT: ; %bb.2: ; %if.end
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll
new file mode 100644
index 0000000..d6922bc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc %s -o %t.o -mcpu=gfx1030 -filetype=obj -O0
+; RUN: llvm-debuginfo-analyzer %t.o --print=all --attribute=all | FileCheck %s
+
+; This test compiles this module with the AMDGPU backend at -O0
+; and makes sure llvm-debuginfo-analyzer works on the resulting object.
+
+; Simple checks to make sure llvm-debuginfo-analyzer didn't fail early.
+; CHECK: Logical View:
+; CHECK: {CompileUnit}
+; CHECK-DAG: {Parameter} 'dtid' -> [0x{{[a-f0-9]+}}]'uint3'
+; CHECK-DAG: {Variable} 'my_var2' -> [0x{{[a-f0-9]+}}]'float'
+; CHECK-DAG: {Line} {{.+}}basic_var.hlsl
+; CHECK: {Code} 's_endpgm'
+
+source_filename = "module"
+target triple = "amdgcn-amd-amdpal"
+
+%dx.types.ResRet.f32 = type { float, float, float, float, i32 }
+
+define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg noundef %globalTable, i32 inreg noundef %userdata4, <3 x i32> inreg noundef %WorkgroupId, i32 inreg noundef %MultiDispatchInfo, <3 x i32> noundef %LocalInvocationId) #0 !dbg !14 {
+ %LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i64 0, !dbg !28
+ %WorkgroupId.i0 = extractelement <3 x i32> %WorkgroupId, i64 0, !dbg !28
+ %pc = call i64 @llvm.amdgcn.s.getpc(), !dbg !28
+ %offset = shl i32 %WorkgroupId.i0, 6, !dbg !28
+ %dtid = add i32 %LocalInvocationId.i0, %offset, !dbg !28
+ #dbg_value(i32 %dtid, !29, !DIExpression(DW_OP_LLVM_fragment, 0, 32), !28)
+ %pc_hi = and i64 %pc, -4294967296, !dbg !30
+ %zext = zext i32 %userdata4 to i64, !dbg !30
+ %ptr_val = or disjoint i64 %pc_hi, %zext, !dbg !30
+ %ptr = inttoptr i64 %ptr_val to ptr addrspace(4), !dbg !30
+ call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) %ptr, i32 4), "dereferenceable"(ptr addrspace(4) %ptr, i32 -1) ], !dbg !30
+ %uav_0 = load <4 x i32>, ptr addrspace(4) %ptr, align 4, !dbg !30, !invariant.load !2
+ %uav_load_1 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %uav_0, i32 %dtid, i32 0, i32 0, i32 0), !dbg !30
+ #dbg_value(%dx.types.ResRet.f32 poison, !31, !DIExpression(), !32)
+ %mul = fmul reassoc arcp contract afn float %uav_load_1, 2.000000e+00, !dbg !33
+ #dbg_value(float %mul, !34, !DIExpression(), !35)
+ call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) %ptr, i32 4), "dereferenceable"(ptr addrspace(4) %ptr, i32 -1) ], !dbg !36
+ %uav_1_ptr = getelementptr i8, ptr addrspace(4) %ptr, i64 32, !dbg !36
+ %.upto01 = insertelement <4 x float> poison, float %mul, i64 0, !dbg !36
+ %filled_vector = shufflevector <4 x float> %.upto01, <4 x float> poison, <4 x i32> zeroinitializer, !dbg !36
+ %uav_1 = load <4 x i32>, ptr addrspace(4) %uav_1_ptr, align 4, !dbg !36, !invariant.load !2
+ call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %filled_vector, <4 x i32> %uav_1, i32 %dtid, i32 0, i32 0, i32 0), !dbg !36
+ ret void, !dbg !37
+}
+
+declare noundef i64 @llvm.amdgcn.s.getpc() #1
+
+declare void @llvm.assume(i1 noundef) #2
+
+declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #3
+
+declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32 immarg) #4
+
+attributes #0 = { memory(readwrite) }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
+attributes #3 = { nocallback nofree nosync nounwind willreturn memory(write) }
+attributes #4 = { nocallback nofree nosync nounwind willreturn memory(read) }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!12, !13}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "dxcoob 1.7.2308.16 (52da17e29)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !3)
+!1 = !DIFile(filename: "tests\\basic_var.hlsl", directory: "")
+!2 = !{}
+!3 = !{!4, !10}
+!4 = distinct !DIGlobalVariableExpression(var: !5, expr: !DIExpression())
+!5 = !DIGlobalVariable(name: "u0", linkageName: "\01?u0@@3V?$RWBuffer@M@@A", scope: !0, file: !1, line: 2, type: !6, isLocal: false, isDefinition: true)
+!6 = !DICompositeType(tag: DW_TAG_class_type, name: "RWBuffer<float>", file: !1, line: 2, size: 32, align: 32, elements: !2, templateParams: !7)
+!7 = !{!8}
+!8 = !DITemplateTypeParameter(name: "element", type: !9)
+!9 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float)
+!10 = distinct !DIGlobalVariableExpression(var: !11, expr: !DIExpression())
+!11 = !DIGlobalVariable(name: "u1", linkageName: "\01?u1@@3V?$RWBuffer@M@@A", scope: !0, file: !1, line: 3, type: !6, isLocal: false, isDefinition: true)
+!12 = !{i32 2, !"Dwarf Version", i32 5}
+!13 = !{i32 2, !"Debug Info Version", i32 3}
+!14 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 7, type: !15, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0)
+!15 = !DISubroutineType(types: !16)
+!16 = !{null, !17}
+!17 = !DIDerivedType(tag: DW_TAG_typedef, name: "uint3", file: !1, baseType: !18)
+!18 = !DICompositeType(tag: DW_TAG_class_type, name: "vector<unsigned int, 3>", file: !1, size: 96, align: 32, elements: !19, templateParams: !24)
+!19 = !{!20, !22, !23}
+!20 = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: !18, file: !1, baseType: !21, size: 32, align: 32, flags: DIFlagPublic)
+!21 = !DIBasicType(name: "unsigned int", size: 32, align: 32, encoding: DW_ATE_unsigned)
+!22 = !DIDerivedType(tag: DW_TAG_member, name: "y", scope: !18, file: !1, baseType: !21, size: 32, align: 32, offset: 32, flags: DIFlagPublic)
+!23 = !DIDerivedType(tag: DW_TAG_member, name: "z", scope: !18, file: !1, baseType: !21, size: 32, align: 32, offset: 64, flags: DIFlagPublic)
+!24 = !{!25, !26}
+!25 = !DITemplateTypeParameter(name: "element", type: !21)
+!26 = !DITemplateValueParameter(name: "element_count", type: !27, value: i32 3)
+!27 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!28 = !DILocation(line: 7, column: 17, scope: !14)
+!29 = !DILocalVariable(name: "dtid", arg: 1, scope: !14, file: !1, line: 7, type: !17)
+!30 = !DILocation(line: 11, column: 18, scope: !14)
+!31 = !DILocalVariable(name: "my_var", scope: !14, file: !1, line: 11, type: !9)
+!32 = !DILocation(line: 11, column: 9, scope: !14)
+!33 = !DILocation(line: 14, column: 26, scope: !14)
+!34 = !DILocalVariable(name: "my_var2", scope: !14, file: !1, line: 14, type: !9)
+!35 = !DILocation(line: 14, column: 9, scope: !14)
+!36 = !DILocation(line: 17, column: 14, scope: !14)
+!37 = !DILocation(line: 19, column: 1, scope: !14)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-prepare-agpr-alloc.mir b/llvm/test/CodeGen/AMDGPU/amdgpu-prepare-agpr-alloc.mir
index 69bdb1f..aaacf1d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-prepare-agpr-alloc.mir
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-prepare-agpr-alloc.mir
@@ -13,6 +13,14 @@
ret void
}
+ define void @func64() {
+ ret void
+ }
+
+ define void @func64_no_agprs() "amdgpu-agpr-alloc"="0,0" {
+ ret void
+ }
+
...
---
name: func
@@ -93,3 +101,116 @@ body: |
%1:agpr_32 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec
...
+
+---
+name: func64
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 4 }
+body: |
+ ; HAS-AGPR-LABEL: name: func64
+ ; HAS-AGPR: bb.0:
+ ; HAS-AGPR-NEXT: successors: %bb.1(0x80000000)
+ ; HAS-AGPR-NEXT: liveins: $vgpr0_vgpr1
+ ; HAS-AGPR-NEXT: {{ $}}
+ ; HAS-AGPR-NEXT: [[V_MOV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e64 $vgpr0_vgpr1, implicit $exec
+ ; HAS-AGPR-NEXT: [[AV_MOV_:%[0-9]+]]:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 54, implicit $exec
+ ; HAS-AGPR-NEXT: [[AV_MOV_1:%[0-9]+]]:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 1, implicit $exec
+ ; HAS-AGPR-NEXT: [[AV_MOV_2:%[0-9]+]]:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
+ ; HAS-AGPR-NEXT: [[V_MOV_B64_e64_1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e64 %stack.0, implicit $exec
+ ; HAS-AGPR-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 65, implicit $exec
+ ; HAS-AGPR-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 279172874240, implicit $exec
+ ; HAS-AGPR-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 279172874305, implicit $exec
+ ; HAS-AGPR-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329938, implicit $exec
+ ; HAS-AGPR-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; HAS-AGPR-NEXT: [[AV_MOV_3:%[0-9]+]]:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 1042479491, implicit $exec
+ ; HAS-AGPR-NEXT: [[AV_MOV_4:%[0-9]+]]:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 4477415320595726336, implicit $exec
+ ; HAS-AGPR-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO $vgpr0_vgpr1, implicit $exec
+ ; HAS-AGPR-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO %stack.0, implicit $exec
+ ; HAS-AGPR-NEXT: {{ $}}
+ ; HAS-AGPR-NEXT: bb.1:
+ ; HAS-AGPR-NEXT: [[AV_MOV_5:%[0-9]+]]:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 3, implicit $exec
+ ;
+ ; NO-AGPR-LABEL: name: func64
+ ; NO-AGPR: bb.0:
+ ; NO-AGPR-NEXT: successors: %bb.1(0x80000000)
+ ; NO-AGPR-NEXT: liveins: $vgpr0_vgpr1
+ ; NO-AGPR-NEXT: {{ $}}
+ ; NO-AGPR-NEXT: [[V_MOV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e64 $vgpr0_vgpr1, implicit $exec
+ ; NO-AGPR-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 54, implicit $exec
+ ; NO-AGPR-NEXT: [[V_MOV_B64_e64_1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e64 1, implicit $exec
+ ; NO-AGPR-NEXT: [[V_MOV_B64_e64_2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e64 64, implicit $exec
+ ; NO-AGPR-NEXT: [[V_MOV_B64_e64_3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e64 %stack.0, implicit $exec
+ ; NO-AGPR-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 65, implicit $exec
+ ; NO-AGPR-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 279172874240, implicit $exec
+ ; NO-AGPR-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 279172874305, implicit $exec
+ ; NO-AGPR-NEXT: [[V_MOV_B4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329938, implicit $exec
+ ; NO-AGPR-NEXT: [[V_MOV_B5:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; NO-AGPR-NEXT: [[V_MOV_B6:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1042479491, implicit $exec
+ ; NO-AGPR-NEXT: [[V_MOV_B7:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4477415320595726336, implicit $exec
+ ; NO-AGPR-NEXT: [[V_MOV_B8:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO $vgpr0_vgpr1, implicit $exec
+ ; NO-AGPR-NEXT: [[V_MOV_B9:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO %stack.0, implicit $exec
+ ; NO-AGPR-NEXT: {{ $}}
+ ; NO-AGPR-NEXT: bb.1:
+ ; NO-AGPR-NEXT: [[V_MOV_B64_e64_4:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e64 3, implicit $exec
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ %0:vreg_64_align2 = V_MOV_B64_e64 $vgpr0_vgpr1, implicit $exec
+ %1:vreg_64_align2 = V_MOV_B64_PSEUDO 54, implicit $exec
+ %2:vreg_64_align2 = V_MOV_B64_e64 1, implicit $exec
+ %3:vreg_64_align2 = V_MOV_B64_e64 64, implicit $exec
+ %4:vreg_64_align2 = V_MOV_B64_e64 %stack.0, implicit $exec
+ %5:vreg_64_align2 = V_MOV_B64_PSEUDO 65, implicit $exec
+ %6:vreg_64_align2 = V_MOV_B64_PSEUDO 279172874240, implicit $exec
+ %7:vreg_64_align2 = V_MOV_B64_PSEUDO 279172874305, implicit $exec
+ %8:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329938, implicit $exec
+ %9:vreg_64_align2 = V_MOV_B64_PSEUDO 9223372036854775808, implicit $exec
+ %10:vreg_64_align2 = V_MOV_B64_PSEUDO 1042479491, implicit $exec
+ %11:vreg_64_align2 = V_MOV_B64_PSEUDO 4477415320595726336, implicit $exec
+ %12:vreg_64_align2 = V_MOV_B64_PSEUDO $vgpr0_vgpr1, implicit $exec
+ %13:vreg_64_align2 = V_MOV_B64_PSEUDO %stack.0, implicit $exec
+
+ bb.1:
+ %14:vreg_64_align2 = V_MOV_B64_e64 3, implicit $exec
+
+...
+
+---
+name: func64_no_agprs
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GFX90A-LABEL: name: func64_no_agprs
+ ; GFX90A: liveins: $vgpr0
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[V_MOV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e64 1, implicit $exec
+ ; GFX90A-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329938, implicit $exec
+ ; GFX90A-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; GFX90A-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1042479491, implicit $exec
+ ; GFX90A-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4477415320595726336, implicit $exec
+ ;
+ ; GFX908-LABEL: name: func64_no_agprs
+ ; GFX908: liveins: $vgpr0
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: [[AV_MOV_:%[0-9]+]]:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 1, implicit $exec
+ ; GFX908-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329938, implicit $exec
+ ; GFX908-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; GFX908-NEXT: [[AV_MOV_1:%[0-9]+]]:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 1042479491, implicit $exec
+ ; GFX908-NEXT: [[AV_MOV_2:%[0-9]+]]:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 4477415320595726336, implicit $exec
+ ;
+ ; NO-AGPR-LABEL: name: func64_no_agprs
+ ; NO-AGPR: liveins: $vgpr0
+ ; NO-AGPR-NEXT: {{ $}}
+ ; NO-AGPR-NEXT: [[V_MOV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e64 1, implicit $exec
+ ; NO-AGPR-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329938, implicit $exec
+ ; NO-AGPR-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO -9223372036854775808, implicit $exec
+ ; NO-AGPR-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1042479491, implicit $exec
+ ; NO-AGPR-NEXT: [[V_MOV_B3:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4477415320595726336, implicit $exec
+ %0:vreg_64_align2 = V_MOV_B64_e64 1, implicit $exec
+ %1:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329938, implicit $exec
+ %2:vreg_64_align2 = V_MOV_B64_PSEUDO 9223372036854775808, implicit $exec
+ %3:vreg_64_align2 = V_MOV_B64_PSEUDO 1042479491, implicit $exec
+ %4:vreg_64_align2 = V_MOV_B64_PSEUDO 4477415320595726336, implicit $exec
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll b/llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll
index 2872202..7e0208c 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll
@@ -117,14 +117,14 @@ define void @call_no_dispatch_id() {
ret void
}
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-workitem-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-workgroup-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-no-workgroup-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-workgroup-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-dispatch-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-queue-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-dispatch-id" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-workgroup-id-x" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-no-workgroup-id-y" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-workgroup-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-dispatch-ptr" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-queue-ptr" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-implicitarg-ptr" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
index cdb9801..7f7bbb2 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -442,7 +442,7 @@ define internal void @defined.func() #3 {
define void @func_call_external() #3 {
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_external
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: call void @external.func()
; ATTRIBUTOR_HSA-NEXT: ret void
;
@@ -462,7 +462,7 @@ define void @func_call_defined() #3 {
define void @func_call_asm() #3 {
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_asm
; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] {
-; ATTRIBUTOR_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR25:[0-9]+]]
+; ATTRIBUTOR_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR24:[0-9]+]]
; ATTRIBUTOR_HSA-NEXT: ret void
;
call void asm sideeffect "", ""() #3
@@ -471,7 +471,7 @@ define void @func_call_asm() #3 {
define amdgpu_kernel void @kern_call_external() #3 {
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_external
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] {
; ATTRIBUTOR_HSA-NEXT: call void @external.func()
; ATTRIBUTOR_HSA-NEXT: ret void
;
@@ -515,7 +515,7 @@ define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 {
define float @func_indirect_call(ptr %fptr) #3 {
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_call
-; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR17]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR15]] {
; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float [[FPTR]]()
; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]]
@@ -528,7 +528,7 @@ define float @func_indirect_call(ptr %fptr) #3 {
declare float @extern() #3
define float @func_extern_call() #3 {
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_extern_call
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] {
; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @extern()
; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]]
@@ -540,7 +540,7 @@ define float @func_extern_call() #3 {
define float @func_null_call(ptr %fptr) #3 {
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_null_call
-; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR17]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR15]] {
; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float null()
; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]]
@@ -568,7 +568,7 @@ define float @func_other_intrinsic_call(float %arg) #3 {
; Hostcall needs to be enabled for sanitizers
define amdgpu_kernel void @kern_sanitize_address() #4 {
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4
; ATTRIBUTOR_HSA-NEXT: ret void
;
@@ -579,7 +579,7 @@ define amdgpu_kernel void @kern_sanitize_address() #4 {
; Hostcall needs to be enabled for sanitizers
define void @func_sanitize_address() #4 {
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4
; ATTRIBUTOR_HSA-NEXT: ret void
;
@@ -590,7 +590,7 @@ define void @func_sanitize_address() #4 {
; Hostcall needs to be enabled for sanitizers
define void @func_indirect_sanitize_address() #3 {
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: call void @func_sanitize_address()
; ATTRIBUTOR_HSA-NEXT: ret void
;
@@ -601,7 +601,7 @@ define void @func_indirect_sanitize_address() #3 {
; Hostcall needs to be enabled for sanitizers
define amdgpu_kernel void @kern_indirect_sanitize_address() #3 {
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_indirect_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR19]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] {
; ATTRIBUTOR_HSA-NEXT: call void @func_sanitize_address()
; ATTRIBUTOR_HSA-NEXT: ret void
;
@@ -615,7 +615,7 @@ declare void @extern_func_sanitize_address() #5
define amdgpu_kernel void @kern_decl_sanitize_address() #3 {
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_decl_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] {
; ATTRIBUTOR_HSA-NEXT: call void @extern_func_sanitize_address()
; ATTRIBUTOR_HSA-NEXT: ret void
;
@@ -627,7 +627,7 @@ declare void @enqueue_block_decl() #6
define internal void @enqueue_block_def() #6 {
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@enqueue_block_def
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR21:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: ret void
;
ret void
@@ -635,7 +635,7 @@ define internal void @enqueue_block_def() #6 {
define amdgpu_kernel void @kern_call_enqueued_block_decl() {
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR23:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: call void @enqueue_block_decl()
; ATTRIBUTOR_HSA-NEXT: ret void
;
@@ -645,7 +645,7 @@ define amdgpu_kernel void @kern_call_enqueued_block_decl() {
define amdgpu_kernel void @kern_call_enqueued_block_def() {
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_def
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR24:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR23:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: call void @enqueue_block_def()
; ATTRIBUTOR_HSA-NEXT: ret void
;
@@ -655,7 +655,7 @@ define amdgpu_kernel void @kern_call_enqueued_block_def() {
define void @unused_enqueue_block() {
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@unused_enqueue_block
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR24]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR23]] {
; ATTRIBUTOR_HSA-NEXT: ret void
;
ret void
@@ -663,7 +663,7 @@ define void @unused_enqueue_block() {
define internal void @known_func() {
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@known_func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR24]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR23]] {
; ATTRIBUTOR_HSA-NEXT: ret void
;
ret void
@@ -672,8 +672,8 @@ define internal void @known_func() {
; Should never happen
define amdgpu_kernel void @kern_callsite_enqueue_block() {
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR24]] {
-; ATTRIBUTOR_HSA-NEXT: call void @known_func() #[[ATTR26:[0-9]+]]
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR23]] {
+; ATTRIBUTOR_HSA-NEXT: call void @known_func() #[[ATTR25:[0-9]+]]
; ATTRIBUTOR_HSA-NEXT: ret void
;
call void @known_func() #6
@@ -691,30 +691,29 @@ attributes #6 = { "enqueued-block" }
;.
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR15:[0-9]+]] = { nounwind "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind sanitize_address "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR20:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR21:[0-9]+]] = { "enqueued-block" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR23]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR24]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { nounwind }
-; ATTRIBUTOR_HSA: attributes #[[ATTR26]] = { "enqueued-block" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind sanitize_address "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR19:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR20:[0-9]+]] = { "enqueued-block" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "enqueued-block" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR23]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR24]] = { nounwind }
+; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "enqueued-block" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index 894ef4f..26c04a3 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -303,7 +303,7 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr
; HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast
; HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR12:[0-9]+]] {
; HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr
-; HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META0:![0-9]+]]
; HSA-NEXT: ret void
;
%stof = addrspacecast ptr addrspace(3) %ptr to ptr
@@ -315,7 +315,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
; HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast
; HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] {
; HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META1:![0-9]+]]
; HSA-NEXT: ret void
;
%stof = addrspacecast ptr addrspace(5) %ptr to ptr
@@ -352,7 +352,7 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt
; HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast
; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
; HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
-; HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META2:![0-9]+]]
; HSA-NEXT: ret void
;
%stof = addrspacecast ptr addrspace(1) %ptr to ptr
@@ -364,7 +364,7 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %
; HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast
; HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] {
; HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr
-; HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr [[STOF]], align 4
+; HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr [[STOF]], align 4, !noalias.addrspace [[META3:![0-9]+]]
; HSA-NEXT: ret void
;
%stof = addrspacecast ptr addrspace(4) %ptr to ptr
@@ -474,17 +474,22 @@ attributes #1 = { nounwind }
; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
; HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+;.
+; HSA: [[META0]] = !{i32 1, i32 3, i32 4, i32 10}
+; HSA: [[META1]] = !{i32 1, i32 5, i32 6, i32 10}
+; HSA: [[META2]] = !{i32 2, i32 10}
+; HSA: [[META3]] = !{i32 1, i32 4, i32 5, i32 10}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
index aeca138..81ccf16 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
@@ -294,13 +294,13 @@ attributes #1 = { nounwind }
;.
; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/ashr64_reduce_flags.ll b/llvm/test/CodeGen/AMDGPU/ashr64_reduce_flags.ll
index 59f3a49..70db5fa 100644
--- a/llvm/test/CodeGen/AMDGPU/ashr64_reduce_flags.ll
+++ b/llvm/test/CodeGen/AMDGPU/ashr64_reduce_flags.ll
@@ -20,14 +20,12 @@ define i64 @ashr_exact(i64 %arg0, i64 %shift_amt) {
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF3]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
; CHECK-NEXT: [[V_ASHRREV_I32_e64_:%[0-9]+]]:vgpr_32 = exact V_ASHRREV_I32_e64 killed [[COPY5]], [[COPY3]], implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
index 45192be..9a4040a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
@@ -1,24 +1,94 @@
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI,SICI,SICIVI,PREGFX11,GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=SICI,CIVI,SICIVI,PREGFX11,GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=CIVI,SICIVI,GFX8PLUS,PREGFX11,GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9PLUS,GFX8PLUS,PREGFX11,GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX11,GFX9PLUS,GFX8PLUS,GCN %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX11 %s
-; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
-; GFX9PLUS-NOT: m0
-; SICIVI-DAG: s_mov_b32 m0
-
-; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
-; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
-; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
-; GFX8PLUS-DAG: s_load_{{dword|b32}} [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
-; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
-; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
-; PREGFX11: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
-; GFX11: ds_cmpstore_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VSWAP]], [[VCMP]] offset:16
-; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(ptr addrspace(1) %out, [8 x i32], ptr addrspace(3) %ptr, [8 x i32], i32 %swap) nounwind {
+; CHECK-LABEL: lds_atomic_cmpxchg_ret_i32_offset:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s2, s[4:5], 0x13
+; CHECK-NEXT: s_load_dword s3, s[4:5], 0x1c
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; CHECK-NEXT: v_mov_b32_e32 v0, 7
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s2
+; CHECK-NEXT: v_mov_b32_e32 v2, s3
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s3, 0xf000
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; CHECK-NEXT: s_endpgm
+;
+; GFX7-LABEL: lds_atomic_cmpxchg_ret_i32_offset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s2, s[4:5], 0x13
+; GFX7-NEXT: s_load_dword s3, s[4:5], 0x1c
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX7-NEXT: v_mov_b32_e32 v0, 7
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, s2
+; GFX7-NEXT: v_mov_b32_e32 v2, s3
+; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: lds_atomic_cmpxchg_ret_i32_offset:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dword s2, s[4:5], 0x4c
+; GFX8-NEXT: s_load_dword s3, s[4:5], 0x70
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8-NEXT: v_mov_b32_e32 v0, 7
+; GFX8-NEXT: s_mov_b32 m0, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v1, v0, v2 offset:16
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: lds_atomic_cmpxchg_ret_i32_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x4c
+; GFX9-NEXT: s_load_dword s3, s[4:5], 0x70
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 7
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: lds_atomic_cmpxchg_ret_i32_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x4c
+; GFX11-NEXT: s_load_b32 s1, s[4:5], 0x70
+; GFX11-NEXT: v_mov_b32_e32 v0, 7
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v0, v1, v2, v0 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
%pair = cmpxchg ptr addrspace(3) %gep, i32 7, i32 %swap seq_cst monotonic
%result = extractvalue { i32, i1 } %pair, 0
@@ -26,24 +96,100 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(ptr addrspace(1) %o
ret void
}
-; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
-; GFX9PLUS-NOT: m0
-; SICIVI-DAG: s_mov_b32 m0
-
-; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SICI-DAG: s_load_dwordx2 s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; GFX8PLUS-DAG: s_load_{{dwordx2|b64}} s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
-; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
-; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
-; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
-; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
-; PREGFX11: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v[[[LOVCMP]]:[[HIVCMP]]], v[[[LOSWAPV]]:[[HISWAPV]]] offset:32
-; GFX11: ds_cmpstore_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v[[[LOSWAPV]]:[[HISWAPV]]], v[[[LOVCMP]]:[[HIVCMP]]] offset:32
-; GCN: [[RESULT]]
-; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr, i64 %swap) nounwind {
+; CHECK-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xd
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; CHECK-NEXT: v_mov_b32_e32 v0, 7
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v4, s6
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s3, 0xf000
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; CHECK-NEXT: s_endpgm
+;
+; GFX7-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s6, s[4:5], 0xb
+; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xd
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX7-NEXT: v_mov_b32_e32 v0, 7
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s2
+; GFX7-NEXT: v_mov_b32_e32 v3, s3
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX8-NEXT: v_mov_b32_e32 v0, 7
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_mov_b32 m0, -1
+; GFX8-NEXT: ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 7
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: lds_atomic_cmpxchg_ret_i64_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-NEXT: v_mov_b32_e32 v0, 7
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, s2
+; GFX11-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: ds_cmpstore_rtn_b64 v[0:1], v4, v[2:3], v[0:1] offset:32
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
%pair = cmpxchg ptr addrspace(3) %gep, i64 7, i64 %swap seq_cst monotonic
%result = extractvalue { i64, i1 } %pair, 0
@@ -51,13 +197,103 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(ptr addrspace(1) %o
ret void
}
-; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset
-; GFX9PLUS-NOT: m0
-; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; GFX9PLUS: ds_{{cmpst|cmpstore}}_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
-; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_load_dword s3, s[4:5], 0xb
+; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; CHECK-NEXT: s_sub_i32 s1, s1, s2
+; CHECK-NEXT: s_lshl_b32 s1, s1, 2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_add_i32 s1, s3, s1
+; CHECK-NEXT: s_add_i32 s1, s1, 16
+; CHECK-NEXT: v_mov_b32_e32 v0, 7
+; CHECK-NEXT: v_mov_b32_e32 v1, s0
+; CHECK-NEXT: v_mov_b32_e32 v2, s1
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: ds_cmpst_rtn_b32 v0, v2, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s7, 0xf000
+; CHECK-NEXT: s_mov_b32 s6, -1
+; CHECK-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; CHECK-NEXT: s_endpgm
+;
+; GFX7-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GFX7-NEXT: v_mov_b32_e32 v0, 7
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_sub_i32 s2, s2, s3
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: s_lshl_b32 s1, s2, 2
+; GFX7-NEXT: s_add_i32 s0, s0, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v2, v0, v1 offset:16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, -1
+; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT: v_mov_b32_e32 v0, 7
+; GFX8-NEXT: s_mov_b32 m0, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_sub_i32 s2, s2, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_lshl_b32 s1, s2, 2
+; GFX8-NEXT: s_add_i32 s0, s0, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v2, v0, v1 offset:16
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 7
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_sub_i32 s2, s2, s3
+; GFX9-NEXT: s_lshl_b32 s2, s2, 2
+; GFX9-NEXT: s_add_i32 s0, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: v_mov_b32_e32 v0, 7
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_sub_i32 s2, s2, s3
+; GFX11-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-NEXT: s_lshl_b32 s2, s2, 2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s0, s0, s2
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: ds_cmpstore_rtn_b32 v0, v1, v2, v0 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_endpgm
%sub = sub i32 %a, %b
%add = add i32 %sub, 4
%gep = getelementptr i32, ptr addrspace(3) %ptr, i32 %add
@@ -67,45 +303,152 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspac
ret void
}
-; GCN-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset:
-; GFX9PLUS-NOT: m0
-; SICIVI-DAG: s_mov_b32 m0
-
-
-; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
-; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x12
-; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
-; GFX8PLUS-DAG: s_load_{{dword|b32}} [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x48
-; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
-; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
-; PREGFX11: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
-; GFX11: ds_cmpstore_b32 [[VPTR]], [[VSWAP]], [[VCMP]] offset:16
-; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(ptr addrspace(3) %ptr, [8 x i32], i32 %swap) nounwind {
+; CHECK-LABEL: lds_atomic_cmpxchg_noret_i32_offset:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s0, s[4:5], 0x9
+; CHECK-NEXT: s_load_dword s1, s[4:5], 0x12
+; CHECK-NEXT: v_mov_b32_e32 v0, 7
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s0
+; CHECK-NEXT: v_mov_b32_e32 v2, s1
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: ds_cmpst_b32 v1, v0, v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_endpgm
+;
+; GFX7-LABEL: lds_atomic_cmpxchg_noret_i32_offset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s0, s[4:5], 0x9
+; GFX7-NEXT: s_load_dword s1, s[4:5], 0x12
+; GFX7-NEXT: v_mov_b32_e32 v0, 7
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: v_mov_b32_e32 v2, s1
+; GFX7-NEXT: ds_cmpst_b32 v1, v0, v2 offset:16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: lds_atomic_cmpxchg_noret_i32_offset:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX8-NEXT: s_load_dword s1, s[4:5], 0x48
+; GFX8-NEXT: v_mov_b32_e32 v0, 7
+; GFX8-NEXT: s_mov_b32 m0, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: ds_cmpst_b32 v1, v0, v2 offset:16
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: lds_atomic_cmpxchg_noret_i32_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
+; GFX9-NEXT: s_load_dword s1, s[4:5], 0x48
+; GFX9-NEXT: v_mov_b32_e32 v0, 7
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: ds_cmpst_b32 v1, v0, v2 offset:16
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: lds_atomic_cmpxchg_noret_i32_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s1, s[4:5], 0x48
+; GFX11-NEXT: v_mov_b32_e32 v0, 7
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-NEXT: ds_cmpstore_b32 v1, v2, v0 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
%pair = cmpxchg ptr addrspace(3) %gep, i32 7, i32 %swap seq_cst monotonic
%result = extractvalue { i32, i1 } %pair, 0
ret void
}
-; GCN-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset:
-; GFX9PLUS-NOT: m0
-; SICIVI-DAG: s_mov_b32 m0
-
-; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
-; SICI-DAG: s_load_dwordx2 s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
-; GFX8PLUS-DAG: s_load_{{dwordx2|b64}} s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
-; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
-; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
-; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
-; PREGFX11: ds_cmpst_b64 [[VPTR]], v[[[LOVCMP]]:[[HIVCMP]]], v[[[LOSWAPV]]:[[HISWAPV]]] offset:32
-; GFX11: ds_cmpstore_b64 [[VPTR]], v[[[LOSWAPV]]:[[HISWAPV]]], v[[[LOVCMP]]:[[HIVCMP]]] offset:32
-; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i64_offset(ptr addrspace(3) %ptr, i64 %swap) nounwind {
+; CHECK-LABEL: lds_atomic_cmpxchg_noret_i64_offset:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s2, s[4:5], 0x9
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
+; CHECK-NEXT: v_mov_b32_e32 v0, 7
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v4, s2
+; CHECK-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: ds_cmpst_b64 v4, v[0:1], v[2:3] offset:32
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_endpgm
+;
+; GFX7-LABEL: lds_atomic_cmpxchg_noret_i64_offset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s2, s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
+; GFX7-NEXT: v_mov_b32_e32 v0, 7
+; GFX7-NEXT: v_mov_b32_e32 v1, 0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v4, s2
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: ds_cmpst_b64 v4, v[0:1], v[2:3] offset:32
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: lds_atomic_cmpxchg_noret_i64_offset:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dword s2, s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX8-NEXT: v_mov_b32_e32 v0, 7
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: s_mov_b32 m0, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: ds_cmpst_b64 v4, v[0:1], v[2:3] offset:32
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: lds_atomic_cmpxchg_noret_i64_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v0, 7
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: ds_cmpst_b64 v4, v[0:1], v[2:3] offset:32
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: lds_atomic_cmpxchg_noret_i64_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX11-NEXT: v_mov_b32_e32 v0, 7
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, s2
+; GFX11-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: ds_cmpstore_b64 v4, v[2:3], v[0:1] offset:32
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
%pair = cmpxchg ptr addrspace(3) %gep, i64 7, i64 %swap seq_cst monotonic
%result = extractvalue { i64, i1 } %pair, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 4cc39d9..0fccdba 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -328,7 +328,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX1232-NEXT: s_endpgm
entry:
- %old = atomicrmw add ptr addrspace(1) %inout, i32 5 syncscope("agent") acq_rel
+ %old = atomicrmw add ptr addrspace(1) %inout, i32 5 syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
store i32 %old, ptr addrspace(1) %out
ret void
}
@@ -655,7 +655,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX1232-NEXT: s_endpgm
entry:
- %old = atomicrmw add ptr addrspace(1) %inout, i32 %additive syncscope("agent") acq_rel
+ %old = atomicrmw add ptr addrspace(1) %inout, i32 %additive syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
store i32 %old, ptr addrspace(1) %out
ret void
}
@@ -1565,7 +1565,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
- %old = atomicrmw add ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel
+ %old = atomicrmw add ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
store i32 %old, ptr addrspace(1) %out
ret void
}
@@ -1899,7 +1899,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX1232-NEXT: s_endpgm
entry:
- %old = atomicrmw add ptr addrspace(1) %inout, i64 5 syncscope("agent") acq_rel
+ %old = atomicrmw add ptr addrspace(1) %inout, i64 5 syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
store i64 %old, ptr addrspace(1) %out
ret void
}
@@ -2284,7 +2284,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX1232-NEXT: s_endpgm
entry:
- %old = atomicrmw add ptr addrspace(1) %inout, i64 %additive syncscope("agent") acq_rel
+ %old = atomicrmw add ptr addrspace(1) %inout, i64 %additive syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
store i64 %old, ptr addrspace(1) %out
ret void
}
@@ -3545,7 +3545,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%zext = zext i32 %lane to i64
- %old = atomicrmw add ptr addrspace(1) %inout, i64 %zext syncscope("agent") acq_rel
+ %old = atomicrmw add ptr addrspace(1) %inout, i64 %zext syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
store i64 %old, ptr addrspace(1) %out
ret void
}
@@ -3861,7 +3861,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX1232-NEXT: s_endpgm
entry:
- %old = atomicrmw sub ptr addrspace(1) %inout, i32 5 syncscope("agent") acq_rel
+ %old = atomicrmw sub ptr addrspace(1) %inout, i32 5 syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
store i32 %old, ptr addrspace(1) %out
ret void
}
@@ -4190,7 +4190,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX1232-NEXT: s_endpgm
entry:
- %old = atomicrmw sub ptr addrspace(1) %inout, i32 %subitive syncscope("agent") acq_rel
+ %old = atomicrmw sub ptr addrspace(1) %inout, i32 %subitive syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
store i32 %old, ptr addrspace(1) %out
ret void
}
@@ -5100,7 +5100,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
- %old = atomicrmw sub ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel
+ %old = atomicrmw sub ptr addrspace(1) %inout, i32 %lane syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
store i32 %old, ptr addrspace(1) %out
ret void
}
@@ -5454,7 +5454,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX1232-NEXT: s_endpgm
entry:
- %old = atomicrmw sub ptr addrspace(1) %inout, i64 5 syncscope("agent") acq_rel
+ %old = atomicrmw sub ptr addrspace(1) %inout, i64 5 syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
store i64 %old, ptr addrspace(1) %out
ret void
}
@@ -5850,7 +5850,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX1232-NEXT: s_endpgm
entry:
- %old = atomicrmw sub ptr addrspace(1) %inout, i64 %subitive syncscope("agent") acq_rel
+ %old = atomicrmw sub ptr addrspace(1) %inout, i64 %subitive syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
store i64 %old, ptr addrspace(1) %out
ret void
}
@@ -7111,7 +7111,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%zext = zext i32 %lane to i64
- %old = atomicrmw sub ptr addrspace(1) %inout, i64 %zext syncscope("agent") acq_rel
+ %old = atomicrmw sub ptr addrspace(1) %inout, i64 %zext syncscope("agent") acq_rel, !amdgpu.no.fine.grained.memory !0
store i64 %old, ptr addrspace(1) %out
ret void
}
@@ -7616,7 +7616,7 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1232-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], null
; GFX1232-FAKE16-NEXT: s_endpgm
- %rmw = atomicrmw or ptr addrspace(1) %uniform.ptr, i8 %val monotonic, align 1
+ %rmw = atomicrmw or ptr addrspace(1) %uniform.ptr, i8 %val monotonic, align 1, !amdgpu.no.fine.grained.memory !0
store i8 %rmw, ptr addrspace(1) %result
ret void
}
@@ -8432,7 +8432,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1232-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], null
; GFX1232-FAKE16-NEXT: s_endpgm
- %rmw = atomicrmw add ptr addrspace(1) %uniform.ptr, i8 %val monotonic, align 1
+ %rmw = atomicrmw add ptr addrspace(1) %uniform.ptr, i8 %val monotonic, align 1, !amdgpu.no.fine.grained.memory !0
store i8 %rmw, ptr addrspace(1) %result
ret void
}
@@ -8804,7 +8804,7 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa
; GFX1232-NEXT: s_mov_b32 s2, -1
; GFX1232-NEXT: buffer_store_b8 v0, off, s[0:3], null
; GFX1232-NEXT: s_endpgm
- %rmw = atomicrmw xchg ptr addrspace(1) %uniform.ptr, i8 %val monotonic, align 1
+ %rmw = atomicrmw xchg ptr addrspace(1) %uniform.ptr, i8 %val monotonic, align 1, !amdgpu.no.fine.grained.memory !0
store i8 %rmw, ptr addrspace(1) %result
ret void
}
@@ -9022,13 +9022,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0
; GFX1164-TRUE16-NEXT: .LBB15_2:
; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-TRUE16-NEXT: v_cndmask_b16 v0.l, s6, 0, vcc
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l
; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1164-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
@@ -9101,13 +9100,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0
; GFX1132-TRUE16-NEXT: .LBB15_2:
; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, 0, vcc_lo
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l
; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1132-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
@@ -9180,13 +9178,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0
; GFX1264-TRUE16-NEXT: .LBB15_2:
; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1264-TRUE16-NEXT: v_cndmask_b16 v0.l, s6, 0, vcc
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1264-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l
; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
@@ -9259,13 +9256,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0
; GFX1232-TRUE16-NEXT: .LBB15_2:
; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1232-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, 0, vcc_lo
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l
; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
@@ -9309,7 +9305,7 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1232-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
; GFX1232-FAKE16-NEXT: s_endpgm
- %rmw = atomicrmw or ptr addrspace(1) %uniform.ptr, i16 %val monotonic, align 2
+ %rmw = atomicrmw or ptr addrspace(1) %uniform.ptr, i16 %val monotonic, align 2, !amdgpu.no.fine.grained.memory !0
store i16 %rmw, ptr addrspace(1) %result
ret void
}
@@ -9662,12 +9658,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2
; GFX1164-TRUE16-NEXT: .LBB16_4: ; %Flow
; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2
; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1164-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
@@ -9789,12 +9784,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
; GFX1132-TRUE16-NEXT: .LBB16_4: ; %Flow
; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2
; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1132-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
@@ -9916,13 +9910,12 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2
; GFX1264-TRUE16-NEXT: .LBB16_4: ; %Flow
; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1264-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1264-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2
; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
@@ -10048,13 +10041,12 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2
; GFX1232-TRUE16-NEXT: .LBB16_4: ; %Flow
; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
; GFX1232-TRUE16-NEXT: s_wait_alu 0xf1ff
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2
; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
@@ -10125,7 +10117,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1232-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
; GFX1232-FAKE16-NEXT: s_endpgm
- %rmw = atomicrmw add ptr addrspace(1) %uniform.ptr, i16 %val monotonic, align 2
+ %rmw = atomicrmw add ptr addrspace(1) %uniform.ptr, i16 %val monotonic, align 2, !amdgpu.no.fine.grained.memory !0
store i16 %rmw, ptr addrspace(1) %result
ret void
}
@@ -10497,7 +10489,7 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp
; GFX1232-NEXT: s_mov_b32 s2, -1
; GFX1232-NEXT: buffer_store_b16 v0, off, s[0:3], null
; GFX1232-NEXT: s_endpgm
- %rmw = atomicrmw xchg ptr addrspace(1) %uniform.ptr, i16 %val monotonic, align 2
+ %rmw = atomicrmw xchg ptr addrspace(1) %uniform.ptr, i16 %val monotonic, align 2, !amdgpu.no.fine.grained.memory !0
store i16 %rmw, ptr addrspace(1) %result
ret void
}
@@ -10734,15 +10726,15 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0
; GFX1164-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1
+; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX1164-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s10, v0
; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0
; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -10828,14 +10820,14 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
; GFX1132-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX1132-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1
+; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX1132-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l
; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0
-; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s3, v0
+; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -10920,15 +10912,15 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0
; GFX1264-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1
+; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX1264-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s10, v0
; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1
+; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0
; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0
@@ -11014,14 +11006,14 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
; GFX1232-TRUE16-NEXT: s_mov_b32 s6, -1
; GFX1232-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1
+; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX1232-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s3, v0
+; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0
@@ -11084,7 +11076,7 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1232-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
; GFX1232-FAKE16-NEXT: s_endpgm
- %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, half %val monotonic, align 2
+ %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, half %val monotonic, align 2, !amdgpu.no.fine.grained.memory !0
store half %rmw, ptr addrspace(1) %result
ret void
}
@@ -11774,7 +11766,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX1232-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
; GFX1232-FAKE16-NEXT: s_endpgm
- %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, bfloat %val monotonic, align 2
+ %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, bfloat %val monotonic, align 2, !amdgpu.no.fine.grained.memory !0
store bfloat %rmw, ptr addrspace(1) %result
ret void
}
@@ -12045,72 +12037,32 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
; GFX1264-LABEL: uniform_fadd_v2f16:
; GFX1264: ; %bb.0:
; GFX1264-NEXT: s_clause 0x1
+; GFX1264-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1264-NEXT: s_load_b32 s10, s[4:5], 0x34
-; GFX1264-NEXT: s_mov_b64 s[8:9], 0
-; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1264-NEXT: s_mov_b32 s6, -1
+; GFX1264-NEXT: v_mov_b32_e32 v0, 0
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_load_b32 s4, s[2:3], 0x0
-; GFX1264-NEXT: s_mov_b32 s5, s3
-; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_mov_b32_e32 v1, s4
-; GFX1264-NEXT: s_mov_b32 s4, s2
-; GFX1264-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-NEXT: v_pk_add_f16 v0, v1, s10
-; GFX1264-NEXT: v_mov_b32_e32 v3, v1
-; GFX1264-NEXT: v_mov_b32_e32 v2, v0
-; GFX1264-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-NEXT: s_wait_loadcnt 0x0
-; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1264-NEXT: v_mov_b32_e32 v1, v2
-; GFX1264-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1264-NEXT: s_and_not1_b64 exec, exec, s[8:9]
-; GFX1264-NEXT: s_cbranch_execnz .LBB20_1
-; GFX1264-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1264-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX1264-NEXT: v_mov_b32_e32 v1, s6
+; GFX1264-NEXT: global_atomic_pk_add_f16 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b32 v2, off, s[0:3], null
+; GFX1264-NEXT: s_wait_loadcnt 0x0
+; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: uniform_fadd_v2f16:
; GFX1232: ; %bb.0:
; GFX1232-NEXT: s_clause 0x1
+; GFX1232-NEXT: s_load_b32 s6, s[4:5], 0x34
; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1232-NEXT: s_load_b32 s8, s[4:5], 0x34
-; GFX1232-NEXT: s_mov_b32 s9, 0
-; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_load_b32 s4, s[2:3], 0x0
-; GFX1232-NEXT: s_mov_b32 s5, s3
-; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_mov_b32_e32 v1, s4
-; GFX1232-NEXT: s_mov_b32 s4, s2
-; GFX1232-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT: v_pk_add_f16 v0, v1, s8
-; GFX1232-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-NEXT: s_wait_loadcnt 0x0
-; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1232-NEXT: v_mov_b32_e32 v1, v2
-; GFX1232-NEXT: s_or_b32 s9, vcc_lo, s9
-; GFX1232-NEXT: s_wait_alu 0xfffe
-; GFX1232-NEXT: s_and_not1_b32 exec_lo, exec_lo, s9
-; GFX1232-NEXT: s_cbranch_execnz .LBB20_1
-; GFX1232-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s9
+; GFX1232-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX1232-NEXT: global_atomic_pk_add_f16 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b32 v2, off, s[0:3], null
+; GFX1232-NEXT: s_wait_loadcnt 0x0
+; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX1232-NEXT: s_endpgm
- %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, <2 x half> %val monotonic, align 4
+ %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, <2 x half> %val monotonic, align 4, !amdgpu.no.fine.grained.memory !0
store <2 x half> %rmw, ptr addrspace(1) %result
ret void
}
@@ -12605,237 +12557,41 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1132-FAKE16-NEXT: buffer_store_b32 v2, off, s[8:11], 0
; GFX1132-FAKE16-NEXT: s_endpgm
;
-; GFX1264-TRUE16-LABEL: uniform_fadd_v2bf16:
-; GFX1264-TRUE16: ; %bb.0:
-; GFX1264-TRUE16-NEXT: s_clause 0x1
-; GFX1264-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1264-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34
-; GFX1264-TRUE16-NEXT: s_mov_b64 s[8:9], 0
-; GFX1264-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1264-TRUE16-NEXT: s_mov_b32 s6, -1
-; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1264-TRUE16-NEXT: s_load_b32 s5, s[2:3], 0x0
-; GFX1264-TRUE16-NEXT: s_and_b32 s10, s4, 0xffff0000
-; GFX1264-TRUE16-NEXT: s_lshl_b32 s11, s4, 16
-; GFX1264-TRUE16-NEXT: s_mov_b32 s4, s2
-; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s5
-; GFX1264-TRUE16-NEXT: s_mov_b32 s5, s3
-; GFX1264-TRUE16-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX1264-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX1264-TRUE16-NEXT: v_add_f32_e32 v0, s11, v0
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264-TRUE16-NEXT: v_add_f32_e32 v2, s10, v2
-; GFX1264-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1264-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX1264-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX1264-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1264-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX1264-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1264-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc
-; GFX1264-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX1264-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1264-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, v2
-; GFX1264-TRUE16-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX1264-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1264-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[8:9]
-; GFX1264-TRUE16-NEXT: s_cbranch_execnz .LBB21_1
-; GFX1264-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX1264-TRUE16-NEXT: buffer_store_b32 v2, off, s[0:3], null
-; GFX1264-TRUE16-NEXT: s_endpgm
-;
-; GFX1264-FAKE16-LABEL: uniform_fadd_v2bf16:
-; GFX1264-FAKE16: ; %bb.0:
-; GFX1264-FAKE16-NEXT: s_clause 0x1
-; GFX1264-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
-; GFX1264-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x34
-; GFX1264-FAKE16-NEXT: s_mov_b64 s[2:3], 0
-; GFX1264-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1264-FAKE16-NEXT: s_mov_b32 s6, -1
-; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1264-FAKE16-NEXT: s_load_b32 s1, s[10:11], 0x0
-; GFX1264-FAKE16-NEXT: s_lshl_b32 s12, s0, 16
-; GFX1264-FAKE16-NEXT: s_and_b32 s13, s0, 0xffff0000
-; GFX1264-FAKE16-NEXT: s_mov_b32 s4, s10
-; GFX1264-FAKE16-NEXT: s_mov_b32 s5, s11
-; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s1
-; GFX1264-FAKE16-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX1264-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX1264-FAKE16-NEXT: v_add_f32_e32 v0, s12, v0
-; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264-FAKE16-NEXT: v_add_f32_e32 v2, s13, v2
-; GFX1264-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1264-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX1264-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX1264-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX1264-FAKE16-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX1264-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1264-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX1264-FAKE16-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
-; GFX1264-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc
-; GFX1264-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX1264-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1]
-; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0
-; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, v2
-; GFX1264-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1264-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1264-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[2:3]
-; GFX1264-FAKE16-NEXT: s_cbranch_execnz .LBB21_1
-; GFX1264-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1264-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-FAKE16-NEXT: s_mov_b32 s10, -1
-; GFX1264-FAKE16-NEXT: buffer_store_b32 v2, off, s[8:11], null
-; GFX1264-FAKE16-NEXT: s_endpgm
-;
-; GFX1232-TRUE16-LABEL: uniform_fadd_v2bf16:
-; GFX1232-TRUE16: ; %bb.0:
-; GFX1232-TRUE16-NEXT: s_clause 0x1
-; GFX1232-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1232-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34
-; GFX1232-TRUE16-NEXT: s_mov_b32 s8, 0
-; GFX1232-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1232-TRUE16-NEXT: s_mov_b32 s6, -1
-; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-TRUE16-NEXT: s_load_b32 s5, s[2:3], 0x0
-; GFX1232-TRUE16-NEXT: s_and_b32 s9, s4, 0xffff0000
-; GFX1232-TRUE16-NEXT: s_lshl_b32 s10, s4, 16
-; GFX1232-TRUE16-NEXT: s_mov_b32 s4, s2
-; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s5
-; GFX1232-TRUE16-NEXT: s_mov_b32 s5, s3
-; GFX1232-TRUE16-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX1232-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX1232-TRUE16-NEXT: v_add_f32_e32 v0, s10, v0
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1232-TRUE16-NEXT: v_add_f32_e32 v2, s9, v2
-; GFX1232-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1232-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX1232-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX1232-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1232-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX1232-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1232-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1232-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX1232-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX1232-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
-; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2
-; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, v2
-; GFX1232-TRUE16-NEXT: s_or_b32 s8, vcc_lo, s8
-; GFX1232-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX1232-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s8
-; GFX1232-TRUE16-NEXT: s_cbranch_execnz .LBB21_1
-; GFX1232-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX1232-TRUE16-NEXT: buffer_store_b32 v2, off, s[0:3], null
-; GFX1232-TRUE16-NEXT: s_endpgm
+; GFX1264-LABEL: uniform_fadd_v2bf16:
+; GFX1264: ; %bb.0:
+; GFX1264-NEXT: s_clause 0x1
+; GFX1264-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1264-NEXT: v_mov_b32_e32 v0, 0
+; GFX1264-NEXT: s_wait_kmcnt 0x0
+; GFX1264-NEXT: v_mov_b32_e32 v1, s6
+; GFX1264-NEXT: global_atomic_pk_add_bf16 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s2, -1
+; GFX1264-NEXT: s_wait_loadcnt 0x0
+; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1264-NEXT: s_endpgm
;
-; GFX1232-FAKE16-LABEL: uniform_fadd_v2bf16:
-; GFX1232-FAKE16: ; %bb.0:
-; GFX1232-FAKE16-NEXT: s_clause 0x1
-; GFX1232-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
-; GFX1232-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x34
-; GFX1232-FAKE16-NEXT: s_mov_b32 s1, 0
-; GFX1232-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1232-FAKE16-NEXT: s_mov_b32 s6, -1
-; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-FAKE16-NEXT: s_load_b32 s4, s[10:11], 0x0
-; GFX1232-FAKE16-NEXT: s_lshl_b32 s2, s0, 16
-; GFX1232-FAKE16-NEXT: s_and_b32 s3, s0, 0xffff0000
-; GFX1232-FAKE16-NEXT: s_mov_b32 s5, s11
-; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s4
-; GFX1232-FAKE16-NEXT: s_mov_b32 s4, s10
-; GFX1232-FAKE16-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1232-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX1232-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX1232-FAKE16-NEXT: v_add_f32_e32 v0, s2, v0
-; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1232-FAKE16-NEXT: v_add_f32_e32 v2, s3, v2
-; GFX1232-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1232-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX1232-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX1232-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX1232-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX1232-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1232-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX1232-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v0, v0
-; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1232-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
-; GFX1232-FAKE16-NEXT: s_wait_alu 0xf1ff
-; GFX1232-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s0
-; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
-; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, v2
-; GFX1232-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX1232-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX1232-FAKE16-NEXT: s_cbranch_execnz .LBB21_1
-; GFX1232-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX1232-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-FAKE16-NEXT: s_mov_b32 s10, -1
-; GFX1232-FAKE16-NEXT: buffer_store_b32 v2, off, s[8:11], null
-; GFX1232-FAKE16-NEXT: s_endpgm
- %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, <2 x bfloat> %val monotonic, align 4
+; GFX1232-LABEL: uniform_fadd_v2bf16:
+; GFX1232: ; %bb.0:
+; GFX1232-NEXT: s_clause 0x1
+; GFX1232-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1232-NEXT: s_wait_kmcnt 0x0
+; GFX1232-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX1232-NEXT: global_atomic_pk_add_bf16 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s2, -1
+; GFX1232-NEXT: s_wait_loadcnt 0x0
+; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1232-NEXT: s_endpgm
+ %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, <2 x bfloat> %val monotonic, align 4, !amdgpu.no.fine.grained.memory !0
store <2 x bfloat> %rmw, ptr addrspace(1) %result
ret void
}
+
+!0 = !{}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX1132_DPP-FAKE16: {{.*}}
; GFX1132_DPP-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll
new file mode 100644
index 0000000..5fc9f4a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll
@@ -0,0 +1,1486 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GFX1250 %s
+
+define float @global_system_atomic_fadd_f32(ptr addrspace(1) %ptr, float %val) {
+; GFX1250-LABEL: global_system_atomic_fadd_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic
+ ret float %result
+}
+
+define float @global_one_as_atomic_fadd_f32(ptr addrspace(1) %ptr, float %val) {
+; GFX1250-LABEL: global_one_as_atomic_fadd_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
+ ret float %result
+}
+
+define double @global_system_atomic_fadd_f64(ptr addrspace(1) %ptr, double %val) {
+; GFX1250-LABEL: global_system_atomic_fadd_f64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic
+ ret double %result
+}
+
+define double @global_one_as_atomic_fadd_f64(ptr addrspace(1) %ptr, double %val) {
+; GFX1250-LABEL: global_one_as_atomic_fadd_f64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic
+ ret double %result
+}
+
+define float @global_system_atomic_fmin_f32(ptr addrspace(1) %ptr, float %val) {
+; GFX1250-LABEL: global_system_atomic_fmin_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val monotonic
+ ret float %result
+}
+
+define float @global_one_as_atomic_fmin_f32(ptr addrspace(1) %ptr, float %val) {
+; GFX1250-LABEL: global_one_as_atomic_fmin_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
+ ret float %result
+}
+
+define double @global_system_atomic_fmin_f64(ptr addrspace(1) %ptr, double %val) {
+; GFX1250-LABEL: global_system_atomic_fmin_f64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_min_num_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val monotonic
+ ret double %result
+}
+
+define double @global_one_as_atomic_fmin_f64(ptr addrspace(1) %ptr, double %val) {
+; GFX1250-LABEL: global_one_as_atomic_fmin_f64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_min_num_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic
+ ret double %result
+}
+
+define float @global_system_atomic_fmax_f32(ptr addrspace(1) %ptr, float %val) {
+; GFX1250-LABEL: global_system_atomic_fmax_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val monotonic
+ ret float %result
+}
+
+define float @global_one_as_atomic_fmax_f32(ptr addrspace(1) %ptr, float %val) {
+; GFX1250-LABEL: global_one_as_atomic_fmax_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
+ ret float %result
+}
+
+define double @global_system_atomic_fmax_f64(ptr addrspace(1) %ptr, double %val) {
+; GFX1250-LABEL: global_system_atomic_fmax_f64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_max_num_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val monotonic
+ ret double %result
+}
+
+define double @global_one_as_atomic_fmax_f64(ptr addrspace(1) %ptr, double %val) {
+; GFX1250-LABEL: global_one_as_atomic_fmax_f64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_max_num_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic
+ ret double %result
+}
+
+define i32 @global_one_as_atomic_min_i32(ptr addrspace(1) %ptr, i32 %val) {
+; GFX1250-LABEL: global_one_as_atomic_min_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_min_i32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw min ptr addrspace(1) %ptr, i32 %val syncscope("one-as") monotonic
+ ret i32 %result
+}
+
+define i32 @global_system_atomic_min_i32(ptr addrspace(1) %ptr, i32 %val) {
+; GFX1250-LABEL: global_system_atomic_min_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_min_i32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw min ptr addrspace(1) %ptr, i32 %val monotonic
+ ret i32 %result
+}
+
+define i32 @global_one_as_atomic_max_i32(ptr addrspace(1) %ptr, i32 %val) {
+; GFX1250-LABEL: global_one_as_atomic_max_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_max_i32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw max ptr addrspace(1) %ptr, i32 %val syncscope("one-as") monotonic
+ ret i32 %result
+}
+
+define i32 @global_system_atomic_max_i32(ptr addrspace(1) %ptr, i32 %val) {
+; GFX1250-LABEL: global_system_atomic_max_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_max_i32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw max ptr addrspace(1) %ptr, i32 %val monotonic
+ ret i32 %result
+}
+
+define i32 @global_one_as_atomic_umin_i32(ptr addrspace(1) %ptr, i32 %val) {
+; GFX1250-LABEL: global_one_as_atomic_umin_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_min_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw umin ptr addrspace(1) %ptr, i32 %val syncscope("one-as") monotonic
+ ret i32 %result
+}
+
+define i32 @global_system_atomic_umin_i32(ptr addrspace(1) %ptr, i32 %val) {
+; GFX1250-LABEL: global_system_atomic_umin_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_min_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw umin ptr addrspace(1) %ptr, i32 %val monotonic
+ ret i32 %result
+}
+
+define i32 @global_one_as_atomic_umax_i32(ptr addrspace(1) %ptr, i32 %val) {
+; GFX1250-LABEL: global_one_as_atomic_umax_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_max_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw umax ptr addrspace(1) %ptr, i32 %val syncscope("one-as") monotonic
+ ret i32 %result
+}
+
+define i32 @global_system_atomic_umax_i32(ptr addrspace(1) %ptr, i32 %val) {
+; GFX1250-LABEL: global_system_atomic_umax_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_max_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw umax ptr addrspace(1) %ptr, i32 %val monotonic
+ ret i32 %result
+}
+
+define i64 @global_one_as_atomic_min_i64(ptr addrspace(1) %ptr, i64 %val) {
+; GFX1250-LABEL: global_one_as_atomic_min_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_min_i64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw min ptr addrspace(1) %ptr, i64 %val syncscope("one-as") monotonic
+ ret i64 %result
+}
+
+define i64 @global_system_atomic_min_i64(ptr addrspace(1) %ptr, i64 %val) {
+; GFX1250-LABEL: global_system_atomic_min_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_min_i64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw min ptr addrspace(1) %ptr, i64 %val monotonic
+ ret i64 %result
+}
+
+define i64 @global_one_as_atomic_max_i64(ptr addrspace(1) %ptr, i64 %val) {
+; GFX1250-LABEL: global_one_as_atomic_max_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_max_i64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw max ptr addrspace(1) %ptr, i64 %val syncscope("one-as") monotonic
+ ret i64 %result
+}
+
+define i64 @global_system_atomic_max_i64(ptr addrspace(1) %ptr, i64 %val) {
+; GFX1250-LABEL: global_system_atomic_max_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_max_i64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw max ptr addrspace(1) %ptr, i64 %val monotonic
+ ret i64 %result
+}
+
+define i64 @global_one_as_atomic_umin_i64(ptr addrspace(1) %ptr, i64 %val) {
+; GFX1250-LABEL: global_one_as_atomic_umin_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_min_u64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw umin ptr addrspace(1) %ptr, i64 %val syncscope("one-as") monotonic
+ ret i64 %result
+}
+
+define i64 @global_system_atomic_umin_i64(ptr addrspace(1) %ptr, i64 %val) {
+; GFX1250-LABEL: global_system_atomic_umin_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_min_u64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw umin ptr addrspace(1) %ptr, i64 %val monotonic
+ ret i64 %result
+}
+
+define i64 @global_one_as_atomic_umax_i64(ptr addrspace(1) %ptr, i64 %val) {
+; GFX1250-LABEL: global_one_as_atomic_umax_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_max_u64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw umax ptr addrspace(1) %ptr, i64 %val syncscope("one-as") monotonic
+ ret i64 %result
+}
+
+define i64 @global_system_atomic_umax_i64(ptr addrspace(1) %ptr, i64 %val) {
+; GFX1250-LABEL: global_system_atomic_umax_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_atomic_max_u64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw umax ptr addrspace(1) %ptr, i64 %val monotonic
+ ret i64 %result
+}
+
+define i16 @global_one_as_atomic_min_i16(ptr addrspace(1) %ptr, i16 %val) {
+; GFX1250-LABEL: global_one_as_atomic_min_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v3, v0
+; GFX1250-NEXT: s_mov_b32 s0, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX1250-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-NEXT: v_min_i16 v5, v5, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB28_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw min ptr addrspace(1) %ptr, i16 %val syncscope("one-as") monotonic
+ ret i16 %result
+}
+
+define i16 @global_one_as_atomic_umin_i16(ptr addrspace(1) %ptr, i16 %val) {
+; GFX1250-LABEL: global_one_as_atomic_umin_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v3, v0
+; GFX1250-NEXT: s_mov_b32 s0, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX1250-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-NEXT: v_min_u16 v5, v5, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB29_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw umin ptr addrspace(1) %ptr, i16 %val syncscope("one-as") monotonic
+ ret i16 %result
+}
+
+define i16 @global_one_as_atomic_max_i16(ptr addrspace(1) %ptr, i16 %val) {
+; GFX1250-LABEL: global_one_as_atomic_max_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v3, v0
+; GFX1250-NEXT: s_mov_b32 s0, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX1250-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-NEXT: v_max_i16 v5, v5, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB30_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw max ptr addrspace(1) %ptr, i16 %val syncscope("one-as") monotonic
+ ret i16 %result
+}
+
+define i16 @global_one_as_atomic_umax_i16(ptr addrspace(1) %ptr, i16 %val) {
+; GFX1250-LABEL: global_one_as_atomic_umax_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v3, v0
+; GFX1250-NEXT: s_mov_b32 s0, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX1250-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-NEXT: v_max_u16 v5, v5, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB31_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw umax ptr addrspace(1) %ptr, i16 %val syncscope("one-as") monotonic
+ ret i16 %result
+}
+
+define float @flat_system_atomic_fadd_f32(ptr %ptr, float %val) {
+; GFX1250-LABEL: flat_system_atomic_fadd_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fadd ptr %ptr, float %val monotonic
+ ret float %result
+}
+
+define float @flat_one_as_atomic_fadd_f32(ptr %ptr, float %val) {
+; GFX1250-LABEL: flat_one_as_atomic_fadd_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fadd ptr %ptr, float %val syncscope("one-as") monotonic
+ ret float %result
+}
+
+define double @flat_system_atomic_fadd_f64(ptr %ptr, double %val) {
+; GFX1250-LABEL: flat_system_atomic_fadd_f64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1250-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB34_6
+; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.check.private
+; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_hi
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_xor_b32_e32 v4, s1, v1
+; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1250-NEXT: s_cbranch_execz .LBB34_3
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.global
+; GFX1250-NEXT: global_atomic_add_f64 v[4:5], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-NEXT: .LBB34_3: ; %Flow
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX1250-NEXT: s_cbranch_execz .LBB34_5
+; GFX1250-NEXT: ; %bb.4: ; %atomicrmw.private
+; GFX1250-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s2, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
+; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[4:5], v[2:3]
+; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT: .LBB34_5: ; %Flow1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-NEXT: .LBB34_6: ; %Flow2
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB34_8
+; GFX1250-NEXT: ; %bb.7: ; %atomicrmw.shared
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: ds_add_rtn_f64 v[4:5], v0, v[2:3]
+; GFX1250-NEXT: .LBB34_8: ; %atomicrmw.phi
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fadd ptr %ptr, double %val monotonic
+ ret double %result
+}
+
+define double @flat_one_as_atomic_fadd_f64(ptr %ptr, double %val) {
+; GFX1250-LABEL: flat_one_as_atomic_fadd_f64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1250-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB35_6
+; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.check.private
+; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_hi
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_xor_b32_e32 v4, s1, v1
+; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_xor_b32 s1, exec_lo, s1
+; GFX1250-NEXT: s_cbranch_execz .LBB35_3
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.global
+; GFX1250-NEXT: global_atomic_add_f64 v[4:5], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-NEXT: .LBB35_3: ; %Flow
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s1, s1
+; GFX1250-NEXT: s_cbranch_execz .LBB35_5
+; GFX1250-NEXT: ; %bb.4: ; %atomicrmw.private
+; GFX1250-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s2, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
+; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[4:5], v[2:3]
+; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT: .LBB35_5: ; %Flow1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-NEXT: .LBB35_6: ; %Flow2
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB35_8
+; GFX1250-NEXT: ; %bb.7: ; %atomicrmw.shared
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: ds_add_rtn_f64 v[4:5], v0, v[2:3]
+; GFX1250-NEXT: .LBB35_8: ; %atomicrmw.phi
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fadd ptr %ptr, double %val syncscope("one-as") monotonic
+ ret double %result
+}
+
+define float @flat_system_atomic_fmin_f32(ptr %ptr, float %val) {
+; GFX1250-LABEL: flat_system_atomic_fmin_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fmin ptr %ptr, float %val monotonic
+ ret float %result
+}
+
+define float @flat_one_as_atomic_fmin_f32(ptr %ptr, float %val) {
+; GFX1250-LABEL: flat_one_as_atomic_fmin_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fmin ptr %ptr, float %val syncscope("one-as") monotonic
+ ret float %result
+}
+
+define double @flat_system_atomic_fmin_f64(ptr %ptr, double %val) {
+; GFX1250-LABEL: flat_system_atomic_fmin_f64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB38_2
+; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX1250-NEXT: flat_atomic_min_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-NEXT: .LBB38_2: ; %Flow
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB38_4
+; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
+; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo
+; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
+; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT: .LBB38_4: ; %atomicrmw.phi
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fmin ptr %ptr, double %val monotonic
+ ret double %result
+}
+
+define double @flat_one_as_atomic_fmin_f64(ptr %ptr, double %val) {
+; GFX1250-LABEL: flat_one_as_atomic_fmin_f64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB39_2
+; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX1250-NEXT: flat_atomic_min_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-NEXT: .LBB39_2: ; %Flow
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB39_4
+; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
+; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo
+; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
+; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT: .LBB39_4: ; %atomicrmw.phi
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fmin ptr %ptr, double %val syncscope("one-as") monotonic
+ ret double %result
+}
+
+define float @flat_system_atomic_fmax_f32(ptr %ptr, float %val) {
+; GFX1250-LABEL: flat_system_atomic_fmax_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fmax ptr %ptr, float %val monotonic
+ ret float %result
+}
+
+define float @flat_one_as_atomic_fmax_f32(ptr %ptr, float %val) {
+; GFX1250-LABEL: flat_one_as_atomic_fmax_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fmax ptr %ptr, float %val syncscope("one-as") monotonic
+ ret float %result
+}
+
+define double @flat_system_atomic_fmax_f64(ptr %ptr, double %val) {
+; GFX1250-LABEL: flat_system_atomic_fmax_f64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB42_2
+; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX1250-NEXT: flat_atomic_max_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-NEXT: .LBB42_2: ; %Flow
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB42_4
+; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
+; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo
+; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
+; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT: .LBB42_4: ; %atomicrmw.phi
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fmax ptr %ptr, double %val monotonic
+ ret double %result
+}
+
+define double @flat_one_as_atomic_fmax_f64(ptr %ptr, double %val) {
+; GFX1250-LABEL: flat_one_as_atomic_fmax_f64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB43_2
+; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX1250-NEXT: flat_atomic_max_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-NEXT: .LBB43_2: ; %Flow
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB43_4
+; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
+; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo
+; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
+; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT: .LBB43_4: ; %atomicrmw.phi
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw fmax ptr %ptr, double %val syncscope("one-as") monotonic
+ ret double %result
+}
+
+define i32 @flat_one_as_atomic_min_i32(ptr %ptr, i32 %val) {
+; GFX1250-LABEL: flat_one_as_atomic_min_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_atomic_min_i32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw min ptr %ptr, i32 %val syncscope("one-as") monotonic
+ ret i32 %result
+}
+
+define i32 @flat_system_atomic_min_i32(ptr %ptr, i32 %val) {
+; GFX1250-LABEL: flat_system_atomic_min_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_atomic_min_i32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw min ptr %ptr, i32 %val monotonic
+ ret i32 %result
+}
+
+define i32 @flat_one_as_atomic_max_i32(ptr %ptr, i32 %val) {
+; GFX1250-LABEL: flat_one_as_atomic_max_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_atomic_max_i32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw max ptr %ptr, i32 %val syncscope("one-as") monotonic
+ ret i32 %result
+}
+
+define i32 @flat_system_atomic_max_i32(ptr %ptr, i32 %val) {
+; GFX1250-LABEL: flat_system_atomic_max_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_atomic_max_i32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw max ptr %ptr, i32 %val monotonic
+ ret i32 %result
+}
+
+define i32 @flat_one_as_atomic_umin_i32(ptr %ptr, i32 %val) {
+; GFX1250-LABEL: flat_one_as_atomic_umin_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_atomic_min_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw umin ptr %ptr, i32 %val syncscope("one-as") monotonic
+ ret i32 %result
+}
+
+define i32 @flat_system_atomic_umin_i32(ptr %ptr, i32 %val) {
+; GFX1250-LABEL: flat_system_atomic_umin_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_atomic_min_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw umin ptr %ptr, i32 %val monotonic
+ ret i32 %result
+}
+
+define i32 @flat_one_as_atomic_umax_i32(ptr %ptr, i32 %val) {
+; GFX1250-LABEL: flat_one_as_atomic_umax_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_atomic_max_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw umax ptr %ptr, i32 %val syncscope("one-as") monotonic
+ ret i32 %result
+}
+
+define i32 @flat_system_atomic_umax_i32(ptr %ptr, i32 %val) {
+; GFX1250-LABEL: flat_system_atomic_umax_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_atomic_max_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw umax ptr %ptr, i32 %val monotonic
+ ret i32 %result
+}
+
+define i64 @flat_one_as_atomic_min_i64(ptr %ptr, i64 %val) {
+; GFX1250-LABEL: flat_one_as_atomic_min_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB52_2
+; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX1250-NEXT: flat_atomic_min_i64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-NEXT: .LBB52_2: ; %Flow
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB52_4
+; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
+; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
+; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_min_i64 v[0:1], v[4:5], v[2:3]
+; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT: .LBB52_4: ; %atomicrmw.phi
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw min ptr %ptr, i64 %val syncscope("one-as") monotonic
+ ret i64 %result
+}
+
+define i64 @flat_system_atomic_min_i64(ptr %ptr, i64 %val) {
+; GFX1250-LABEL: flat_system_atomic_min_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB53_2
+; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX1250-NEXT: flat_atomic_min_i64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-NEXT: .LBB53_2: ; %Flow
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB53_4
+; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
+; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
+; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_min_i64 v[0:1], v[4:5], v[2:3]
+; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT: .LBB53_4: ; %atomicrmw.phi
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw min ptr %ptr, i64 %val monotonic
+ ret i64 %result
+}
+
+define i64 @flat_one_as_atomic_max_i64(ptr %ptr, i64 %val) {
+; GFX1250-LABEL: flat_one_as_atomic_max_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB54_2
+; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX1250-NEXT: flat_atomic_max_i64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-NEXT: .LBB54_2: ; %Flow
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB54_4
+; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
+; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
+; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_max_i64 v[0:1], v[4:5], v[2:3]
+; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT: .LBB54_4: ; %atomicrmw.phi
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw max ptr %ptr, i64 %val syncscope("one-as") monotonic
+ ret i64 %result
+}
+
+define i64 @flat_system_atomic_max_i64(ptr %ptr, i64 %val) {
+; GFX1250-LABEL: flat_system_atomic_max_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB55_2
+; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX1250-NEXT: flat_atomic_max_i64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-NEXT: .LBB55_2: ; %Flow
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB55_4
+; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
+; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
+; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_max_i64 v[0:1], v[4:5], v[2:3]
+; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT: .LBB55_4: ; %atomicrmw.phi
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw max ptr %ptr, i64 %val monotonic
+ ret i64 %result
+}
+
+define i64 @flat_one_as_atomic_umin_i64(ptr %ptr, i64 %val) {
+; GFX1250-LABEL: flat_one_as_atomic_umin_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB56_2
+; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX1250-NEXT: flat_atomic_min_u64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-NEXT: .LBB56_2: ; %Flow
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB56_4
+; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
+; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
+; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_min_u64 v[0:1], v[4:5], v[2:3]
+; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT: .LBB56_4: ; %atomicrmw.phi
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw umin ptr %ptr, i64 %val syncscope("one-as") monotonic
+ ret i64 %result
+}
+
+define i64 @flat_system_atomic_umin_i64(ptr %ptr, i64 %val) {
+; GFX1250-LABEL: flat_system_atomic_umin_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB57_2
+; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX1250-NEXT: flat_atomic_min_u64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-NEXT: .LBB57_2: ; %Flow
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB57_4
+; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
+; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
+; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_min_u64 v[0:1], v[4:5], v[2:3]
+; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT: .LBB57_4: ; %atomicrmw.phi
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw umin ptr %ptr, i64 %val monotonic
+ ret i64 %result
+}
+
+define i64 @flat_one_as_atomic_umax_i64(ptr %ptr, i64 %val) {
+; GFX1250-LABEL: flat_one_as_atomic_umax_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB58_2
+; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX1250-NEXT: flat_atomic_max_u64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-NEXT: .LBB58_2: ; %Flow
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB58_4
+; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
+; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
+; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_max_u64 v[0:1], v[4:5], v[2:3]
+; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT: .LBB58_4: ; %atomicrmw.phi
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw umax ptr %ptr, i64 %val syncscope("one-as") monotonic
+ ret i64 %result
+}
+
+define i64 @flat_system_atomic_umax_i64(ptr %ptr, i64 %val) {
+; GFX1250-LABEL: flat_system_atomic_umax_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4
+; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB59_2
+; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX1250-NEXT: flat_atomic_max_u64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-NEXT: .LBB59_2: ; %Flow
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_cbranch_execz .LBB59_4
+; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private
+; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
+; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
+; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_max_u64 v[0:1], v[4:5], v[2:3]
+; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT: .LBB59_4: ; %atomicrmw.phi
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw umax ptr %ptr, i64 %val monotonic
+ ret i64 %result
+}
+
+define i16 @flat_one_as_atomic_min_i16(ptr %ptr, i16 %val) {
+; GFX1250-LABEL: flat_one_as_atomic_min_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v3, v0
+; GFX1250-NEXT: s_mov_b32 s0, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX1250-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-NEXT: flat_load_b32 v5, v[0:1]
+; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-NEXT: v_min_i16 v5, v5, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB60_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw min ptr %ptr, i16 %val syncscope("one-as") monotonic
+ ret i16 %result
+}
+
+define i16 @flat_one_as_atomic_umin_i16(ptr %ptr, i16 %val) {
+; GFX1250-LABEL: flat_one_as_atomic_umin_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v3, v0
+; GFX1250-NEXT: s_mov_b32 s0, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX1250-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-NEXT: flat_load_b32 v5, v[0:1]
+; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-NEXT: v_min_u16 v5, v5, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB61_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw umin ptr %ptr, i16 %val syncscope("one-as") monotonic
+ ret i16 %result
+}
+
+define i16 @flat_one_as_atomic_max_i16(ptr %ptr, i16 %val) {
+; GFX1250-LABEL: flat_one_as_atomic_max_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v3, v0
+; GFX1250-NEXT: s_mov_b32 s0, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX1250-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-NEXT: flat_load_b32 v5, v[0:1]
+; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-NEXT: v_max_i16 v5, v5, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB62_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw max ptr %ptr, i16 %val syncscope("one-as") monotonic
+ ret i16 %result
+}
+
+define i16 @flat_one_as_atomic_umax_i16(ptr %ptr, i16 %val) {
+; GFX1250-LABEL: flat_one_as_atomic_umax_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v3, v0
+; GFX1250-NEXT: s_mov_b32 s0, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX1250-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-NEXT: flat_load_b32 v5, v[0:1]
+; GFX1250-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-NEXT: v_max_u16 v5, v5, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB63_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %result = atomicrmw umax ptr %ptr, i16 %val syncscope("one-as") monotonic
+ ret i16 %result
+}
diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
index 887f489..3266fde 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
@@ -25,7 +25,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) {
; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_endpgm
entry:
- %gep = getelementptr i32, ptr %addr, i32 -4
+ %gep = getelementptr inbounds i32, ptr %addr, i32 -4
%unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
ret void
}
@@ -49,7 +49,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32
; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16
; GFX12-GISEL-NEXT: s_endpgm
entry:
- %gep = getelementptr i32, ptr %addr, i32 -4
+ %gep = getelementptr inbounds i32, ptr %addr, i32 -4
%unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
ret void
}
@@ -83,7 +83,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr
; GFX12-GISEL-NEXT: flat_store_b32 v[0:1], v2
; GFX12-GISEL-NEXT: s_endpgm
entry:
- %gep = getelementptr i32, ptr %addr, i32 4
+ %gep = getelementptr inbounds i32, ptr %addr, i32 4
%val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
store i32 %val, ptr %use
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll
index 366432e0..6ec6dce 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-max-num-workgroups-propagate.ll
@@ -223,16 +223,16 @@ define amdgpu_kernel void @kernel_explicit_worst_case() #9 {
attributes #9 = {"amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295"}
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="1,2,3" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="100,10,99" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="100,8,32" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-max-num-workgroups"="16,10,99" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-max-num-workgroups"="4294967295,1,1" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-max-num-workgroups"="1,4294967295,1" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR7]] = { "amdgpu-max-num-workgroups"="1,1,4294967295" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { "amdgpu-max-num-workgroups"="42,99,123" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR9]] = { "amdgpu-max-num-workgroups"="256,128,1024" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR10]] = { "amdgpu-max-num-workgroups"="256,128,2048" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR11]] = { "amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="1,2,3" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="100,10,99" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="100,8,32" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-max-num-workgroups"="16,10,99" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-max-num-workgroups"="4294967295,1,1" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-max-num-workgroups"="1,4294967295,1" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-max-num-workgroups"="1,1,4294967295" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-max-num-workgroups"="42,99,123" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR9]] = { "amdgpu-max-num-workgroups"="256,128,1024" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR10]] = { "amdgpu-max-num-workgroups"="256,128,2048" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR11]] = { "amdgpu-max-num-workgroups"="4294967295,4294967295,4294967295" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll
index 93ebeaf..fcca3d7 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll
@@ -13,13 +13,13 @@ define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 {
; GFX9-LABEL: define void @with_private_to_flat_addrspacecast(
; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META0:![0-9]+]]
; GFX9-NEXT: ret void
;
; GFX10-LABEL: define void @with_private_to_flat_addrspacecast(
; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META0:![0-9]+]]
; GFX10-NEXT: ret void
;
%stof = addrspacecast ptr addrspace(5) %ptr to ptr
@@ -31,13 +31,13 @@ define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addr
; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(
; GFX9-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX9-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META0]]
; GFX9-NEXT: ret void
;
; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(
; GFX10-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
; GFX10-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; GFX10-NEXT: store volatile i32 0, ptr [[STOF]], align 4, !noalias.addrspace [[META0]]
; GFX10-NEXT: ret void
;
%stof = addrspacecast ptr addrspace(5) %ptr to ptr
@@ -147,9 +147,13 @@ define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %
attributes #0 = { "amdgpu-no-flat-scratch-init" }
;.
-; GFX9: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
; GFX9: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx900" }
;.
-; GFX10: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
; GFX10: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1010" }
;.
+; GFX9: [[META0]] = !{i32 1, i32 5, i32 6, i32 10}
+;.
+; GFX10: [[META0]] = !{i32 1, i32 5, i32 6, i32 10}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
index b610f11..009dec8 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
@@ -865,19 +865,19 @@ define amdgpu_kernel void @with_inline_asm() {
}
;.
-; GFX9: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; GFX9: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; GFX9: attributes #[[ATTR2]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; GFX9: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9: attributes #[[ATTR2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
; GFX9: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx900" }
-; GFX9: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
;.
-; GFX10: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-; GFX10: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-; GFX10: attributes #[[ATTR2]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-; GFX10: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10: attributes #[[ATTR2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
; GFX10: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1010" }
-; GFX10: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
;.
; GFX9: [[META0]] = !{i32 2, i32 10}
; GFX9: [[META1]] = !{i32 1, i32 2, i32 3, i32 10}
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll b/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll
index 2b9f579..a9efcdc 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll
@@ -63,5 +63,5 @@ define amdgpu_kernel void @entry() {
ret void
}
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll
new file mode 100644
index 0000000..3704012
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll
@@ -0,0 +1,213 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -stress-regalloc=18 < %s | FileCheck %s
+
+define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, i1 %cond.i.i.i2295, ptr addrspace(1) %ptr, ptr %ptr1) #0 {
+; CHECK-LABEL: vgpr_mfma_pass_av_split_crash:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8
+; CHECK-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x0
+; CHECK-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x10
+; CHECK-NEXT: v_mov_b32_e32 v1, 0x3e21eeb6
+; CHECK-NEXT: v_mov_b32_e32 v20, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_bitcmp1_b32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[16:17], -1, 0
+; CHECK-NEXT: s_xor_b64 s[18:19], s[16:17], -1
+; CHECK-NEXT: s_bitcmp1_b32 s0, 8
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x9037ab78
+; CHECK-NEXT: v_accvgpr_write_b32 a3, v1
+; CHECK-NEXT: s_xor_b64 s[20:21], s[2:3], -1
+; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3]
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v0
+; CHECK-NEXT: v_mov_b32_e32 v2, 0xa17f65f6
+; CHECK-NEXT: v_mov_b32_e32 v3, 0xbe927e4f
+; CHECK-NEXT: v_mov_b32_e32 v4, 0x19f4ec90
+; CHECK-NEXT: v_mov_b32_e32 v5, 0x3efa01a0
+; CHECK-NEXT: v_mov_b32_e32 v6, 0x16c16967
+; CHECK-NEXT: v_mov_b32_e32 v7, 0xbf56c16c
+; CHECK-NEXT: v_mov_b32_e32 v8, 0x69efb384
+; CHECK-NEXT: v_mov_b32_e32 v9, 0x3f4b2bb0
+; CHECK-NEXT: v_mov_b32_e32 v10, 0xa57d9582
+; CHECK-NEXT: v_mov_b32_e32 v11, 0xbf8c6ea4
+; CHECK-NEXT: v_mov_b32_e32 v12, 0xe82d3ff0
+; CHECK-NEXT: v_mov_b32_e32 v13, 0xbfa59976
+; CHECK-NEXT: v_mov_b32_e32 v14, 0x8427b883
+; CHECK-NEXT: v_mov_b32_e32 v15, 0x3fae1bb4
+; CHECK-NEXT: s_mov_b64 s[22:23], 0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x57b87036
+; CHECK-NEXT: v_mov_b32_e32 v1, 0x3fb3b136
+; CHECK-NEXT: s_and_b64 s[4:5], exec, s[16:17]
+; CHECK-NEXT: v_mov_b32_e32 v18, 0x55555523
+; CHECK-NEXT: v_mov_b32_e32 v19, 0xbfd55555
+; CHECK-NEXT: s_and_b64 s[6:7], exec, s[18:19]
+; CHECK-NEXT: v_mov_b32_e32 v21, v20
+; CHECK-NEXT: ; implicit-def: $vgpr30_vgpr31
+; CHECK-NEXT: ; implicit-def: $vgpr22_vgpr23
+; CHECK-NEXT: s_branch .LBB0_2
+; CHECK-NEXT: .LBB0_1: ; %Flow9
+; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT: s_andn2_b64 vcc, exec, s[24:25]
+; CHECK-NEXT: s_cbranch_vccz .LBB0_17
+; CHECK-NEXT: .LBB0_2: ; %._crit_edge1942.i.i.i3548
+; CHECK-NEXT: ; =>This Loop Header: Depth=1
+; CHECK-NEXT: ; Child Loop BB0_6 Depth 2
+; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1]
+; CHECK-NEXT: s_cbranch_vccnz .LBB0_9
+; CHECK-NEXT: ; %bb.3: ; %.preheader1868.i.i.i3244
+; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT: s_mov_b64 vcc, s[4:5]
+; CHECK-NEXT: s_cbranch_vccz .LBB0_10
+; CHECK-NEXT: ; %bb.4: ; %.preheader1855.i.i.i3329.preheader
+; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
+; CHECK-NEXT: flat_load_dwordx2 v[24:25], v[24:25]
+; CHECK-NEXT: v_accvgpr_read_b32 v27, a3
+; CHECK-NEXT: v_accvgpr_read_b32 v26, a2
+; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[2:3]
+; CHECK-NEXT: v_mov_b64_e32 v[16:17], v[0:1]
+; CHECK-NEXT: v_accvgpr_write_b32 a0, 0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, 0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[24:25]
+; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27]
+; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[4:5]
+; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29]
+; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[6:7]
+; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27]
+; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[8:9]
+; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29]
+; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[10:11]
+; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27]
+; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[12:13]
+; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29]
+; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[14:15]
+; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27]
+; CHECK-NEXT: v_fmac_f64_e32 v[16:17], 0, v[28:29]
+; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[18:19]
+; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[16:17]
+; CHECK-NEXT: s_branch .LBB0_6
+; CHECK-NEXT: .LBB0_5: ; %Flow
+; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2
+; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9]
+; CHECK-NEXT: s_cbranch_vccnz .LBB0_11
+; CHECK-NEXT: .LBB0_6: ; %.preheader1855.i.i.i3329
+; CHECK-NEXT: ; Parent Loop BB0_2 Depth=1
+; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
+; CHECK-NEXT: v_accvgpr_read_b32 v29, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v28, a0
+; CHECK-NEXT: s_mov_b64 s[24:25], -1
+; CHECK-NEXT: s_mov_b64 s[8:9], -1
+; CHECK-NEXT: s_mov_b64 vcc, s[2:3]
+; CHECK-NEXT: ; implicit-def: $agpr0_agpr1
+; CHECK-NEXT: s_cbranch_vccz .LBB0_5
+; CHECK-NEXT: ; %bb.7: ; %.lr.ph2070.i.i.i3291
+; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v30
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v31
+; CHECK-NEXT: s_mov_b64 s[8:9], s[18:19]
+; CHECK-NEXT: s_mov_b64 vcc, s[6:7]
+; CHECK-NEXT: s_cbranch_vccz .LBB0_5
+; CHECK-NEXT: ; %bb.8: ; %.preheader1856.preheader.i.i.i3325
+; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v26
+; CHECK-NEXT: s_mov_b64 s[24:25], 0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v27
+; CHECK-NEXT: s_mov_b64 s[8:9], 0
+; CHECK-NEXT: s_branch .LBB0_5
+; CHECK-NEXT: .LBB0_9: ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT: s_mov_b64 s[22:23], 0
+; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[10:11]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[20:21]
+; CHECK-NEXT: s_branch .LBB0_15
+; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT: s_mov_b64 s[8:9], -1
+; CHECK-NEXT: v_mov_b64_e32 v[22:23], 0
+; CHECK-NEXT: s_branch .LBB0_15
+; CHECK-NEXT: .LBB0_11: ; %loop.exit.guard
+; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT: s_and_b64 vcc, exec, s[24:25]
+; CHECK-NEXT: s_cbranch_vccz .LBB0_13
+; CHECK-NEXT: ; %bb.12: ; %._crit_edge2105.i.i.i2330.loopexit
+; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], 0, v[28:29]
+; CHECK-NEXT: v_cndmask_b32_e64 v23, v23, 0, s[16:17]
+; CHECK-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[16:17]
+; CHECK-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[8:9]
+; CHECK-NEXT: v_mov_b32_e32 v17, v16
+; CHECK-NEXT: s_and_b64 s[8:9], exec, s[16:17]
+; CHECK-NEXT: global_store_dwordx2 v20, v[16:17], s[12:13]
+; CHECK-NEXT: s_cselect_b32 s23, s23, 0
+; CHECK-NEXT: s_cselect_b32 s22, s22, 0
+; CHECK-NEXT: s_mov_b64 s[8:9], -1
+; CHECK-NEXT: s_branch .LBB0_14
+; CHECK-NEXT: .LBB0_13: ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT: s_mov_b64 s[8:9], 0
+; CHECK-NEXT: v_mov_b64_e32 v[22:23], 0
+; CHECK-NEXT: .LBB0_14: ; %Flow6
+; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[24:25]
+; CHECK-NEXT: .LBB0_15: ; %Flow6
+; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT: s_mov_b64 s[24:25], -1
+; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9]
+; CHECK-NEXT: s_cbranch_vccz .LBB0_1
+; CHECK-NEXT: ; %bb.16: ; %._crit_edge2105.i.i.i2330
+; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT: s_mov_b64 s[24:25], 0
+; CHECK-NEXT: global_store_dwordx2 v20, v[20:21], s[12:13]
+; CHECK-NEXT: s_branch .LBB0_1
+; CHECK-NEXT: .LBB0_17: ; %DummyReturnBlock
+; CHECK-NEXT: s_endpgm
+entry:
+ br label %._crit_edge1942.i.i.i3548
+
+._crit_edge1942.i.i.i3548: ; preds = %._crit_edge2105.i.i.i2330, %entry
+ %.sroa.02591.0.i.i.i226323 = phi double [ poison, %entry ], [ %.sroa.02591.3.i.i.i2301, %._crit_edge2105.i.i.i2330 ]
+ %.sroa.3.0.i.i.i2270 = phi double [ poison, %entry ], [ %.sroa.3.3.i.i.i2308, %._crit_edge2105.i.i.i2330 ]
+ %.014942244.i.i.i2280 = phi double [ 0.000000e+00, %entry ], [ %.31497.i.i.i2317, %._crit_edge2105.i.i.i2330 ]
+ br i1 %cond.i.i.i2295, label %.preheader1868.i.i.i3244, label %._crit_edge2105.i.i.i2330
+
+.preheader1868.i.i.i3244: ; preds = %._crit_edge1942.i.i.i3548
+ %i = load double, ptr %ptr1, align 8
+ %i3 = call double @llvm.fma.f64(double %i, double 0.000000e+00, double 0x3E21EEB69037AB78)
+ %i4 = call double @llvm.fma.f64(double 0.000000e+00, double %i3, double 0xBE927E4FA17F65F6)
+ %i5 = call double @llvm.fma.f64(double 0.000000e+00, double %i4, double 0x3EFA01A019F4EC90)
+ %i6 = call double @llvm.fma.f64(double 0.000000e+00, double %i5, double 0xBF56C16C16C16967)
+ %spec.select.i.i.i3288 = select i1 %arg2, double 0.000000e+00, double %.sroa.3.0.i.i.i2270
+ br i1 %arg2, label %.preheader1855.i.i.i3329, label %._crit_edge2105.i.i.i2330
+
+.lr.ph2070.i.i.i3291: ; preds = %.preheader1855.i.i.i3329
+ br i1 %arg2, label %.preheader1855.i.i.i3329, label %.preheader1856.preheader.i.i.i3325
+
+.preheader1856.preheader.i.i.i3325: ; preds = %.lr.ph2070.i.i.i3291
+ %i11 = call double @llvm.fma.f64(double 0.000000e+00, double %i6, double 0x3F4B2BB069EFB384)
+ %i14 = call double @llvm.fma.f64(double 0.000000e+00, double %i11, double 0xBF8C6EA4A57D9582)
+ %i18 = call double @llvm.fma.f64(double 0.000000e+00, double %i14, double 0xBFA59976E82D3FF0)
+ %i21 = call double @llvm.fma.f64(double 0.000000e+00, double %i18, double 0x3FAE1BB48427B883)
+ %i23 = call double @llvm.fma.f64(double 0.000000e+00, double %i21, double 0x3FB3B13657B87036)
+ %i28 = call double @llvm.fma.f64(double 0.000000e+00, double %i23, double 0xBFD5555555555523)
+ br label %.preheader1855.i.i.i3329
+
+.preheader1855.i.i.i3329: ; preds = %.preheader1856.preheader.i.i.i3325, %.lr.ph2070.i.i.i3291, %.preheader1868.i.i.i3244
+ %.sroa.02591.4.i.i.i3335 = phi double [ %i28, %.preheader1856.preheader.i.i.i3325 ], [ %.sroa.02591.0.i.i.i226323, %.lr.ph2070.i.i.i3291 ], [ 0.000000e+00, %.preheader1868.i.i.i3244 ]
+ %.21496.ph.i.i.i3348 = select i1 %arg2, double %.014942244.i.i.i2280, double 0.000000e+00
+ %i31 = fcmp one double %.sroa.02591.4.i.i.i3335, 0.000000e+00
+ %i32 = select i1 %i31, <2 x i32> zeroinitializer, <2 x i32> splat (i32 1)
+ store <2 x i32> %i32, ptr addrspace(1) %ptr, align 8
+ br i1 %cond.i.i.i2295, label %.lr.ph2070.i.i.i3291, label %._crit_edge2105.i.i.i2330
+
+._crit_edge2105.i.i.i2330: ; preds = %.preheader1855.i.i.i3329, %.preheader1868.i.i.i3244, %._crit_edge1942.i.i.i3548
+ %.sroa.02591.3.i.i.i2301 = phi double [ %.sroa.02591.0.i.i.i226323, %.preheader1868.i.i.i3244 ], [ %arg1, %._crit_edge1942.i.i.i3548 ], [ %i, %.preheader1855.i.i.i3329 ]
+ %.sroa.3.3.i.i.i2308 = phi double [ 0.000000e+00, %.preheader1868.i.i.i3244 ], [ %.sroa.3.0.i.i.i2270, %._crit_edge1942.i.i.i3548 ], [ %spec.select.i.i.i3288, %.preheader1855.i.i.i3329 ]
+ %.31497.i.i.i2317 = phi double [ %.014942244.i.i.i2280, %.preheader1868.i.i.i3244 ], [ 0.000000e+00, %._crit_edge1942.i.i.i3548 ], [ %.21496.ph.i.i.i3348, %.preheader1855.i.i.i3329 ]
+ store double 0.000000e+00, ptr addrspace(1) %ptr, align 8
+ br label %._crit_edge1942.i.i.i3548
+}
+
+declare double @llvm.fma.f64(double, double, double) #1
+
+attributes #0 = { "amdgpu-waves-per-eu"="8,8" "target-cpu"="gfx942" }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir b/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
index 1f9d490..272997c 100644
--- a/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
+++ b/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
@@ -1,6 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=postrapseudos %s -o - | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass=postrapseudos %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=postrapseudos %s -o - | FileCheck -check-prefixes=CHECK,GFX908 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=postrapseudos %s -o - | FileCheck -check-prefixes=CHECK,GFX90A %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass=postrapseudos %s -o - | FileCheck -check-prefixes=CHECK,GFX942 %s
---
name: av_mov_b32_imm_pseudo_agpr_0
@@ -54,3 +55,134 @@ body: |
; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec
$agpr1 = AV_MOV_B32_IMM_PSEUDO $vgpr0, implicit $exec
...
+
+---
+name: av_mov_b64_imm_pseudo_agpr_0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_agpr_0
+ ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr0_agpr1
+ ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr0_agpr1
+ $agpr0_agpr1 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+...
+
+---
+name: av_mov_b64_imm_pseudo_agpr_neg1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_agpr_neg1
+ ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 -1, implicit $exec, implicit-def $agpr0_agpr1
+ ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 -1, implicit $exec, implicit-def $agpr0_agpr1
+ $agpr0_agpr1 = AV_MOV_B64_IMM_PSEUDO -1, implicit $exec
+...
+
+---
+name: av_mov_b64_imm_pseudo_agpr_64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_agpr_64
+ ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec, implicit-def $agpr0_agpr1
+ ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr0_agpr1
+ $agpr0_agpr1 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
+...
+
+---
+name: av_mov_b64_imm_pseudo_vgpr_0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GFX908-LABEL: name: av_mov_b64_imm_pseudo_vgpr_0
+ ; GFX908: $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
+ ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
+ ;
+ ; GFX90A-LABEL: name: av_mov_b64_imm_pseudo_vgpr_0
+ ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec
+ ;
+ ; GFX942-LABEL: name: av_mov_b64_imm_pseudo_vgpr_0
+ ; GFX942: $vgpr0_vgpr1 = V_MOV_B64_e32 0, implicit $exec
+ $vgpr0_vgpr1 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+...
+
+---
+name: av_mov_b64_imm_pseudo_vgpr_64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GFX908-LABEL: name: av_mov_b64_imm_pseudo_vgpr_64
+ ; GFX908: $vgpr0 = V_MOV_B32_e32 64, implicit $exec, implicit-def $vgpr0_vgpr1
+ ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
+ ;
+ ; GFX90A-LABEL: name: av_mov_b64_imm_pseudo_vgpr_64
+ ; GFX90A: $vgpr0 = V_MOV_B32_e32 64, implicit $exec, implicit-def $vgpr0_vgpr1
+ ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
+ ;
+ ; GFX942-LABEL: name: av_mov_b64_imm_pseudo_vgpr_64
+ ; GFX942: $vgpr0_vgpr1 = V_MOV_B64_e32 64, implicit $exec
+ $vgpr0_vgpr1 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
+...
+
+---
+name: av_mov_b64_imm_pseudo_agpr_64_hi_0_lo
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_agpr_64_hi_0_lo
+ ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr0_agpr1
+ ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec, implicit-def $agpr0_agpr1
+ $agpr0_agpr1 = AV_MOV_B64_IMM_PSEUDO 274877906944, implicit $exec
+...
+
+---
+name: av_mov_b64_imm_pseudo_agpr_64_hi_2_lo
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_agpr_64_hi_2_lo
+ ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec, implicit-def $agpr0_agpr1
+ ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec, implicit-def $agpr0_agpr1
+ $agpr0_agpr1 = AV_MOV_B64_IMM_PSEUDO 274877906946, implicit $exec
+...
+
+---
+name: av_mov_b64_imm_pseudo_agpr_neg16_hi_9_lo
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: av_mov_b64_imm_pseudo_agpr_neg16_hi_9_lo
+ ; CHECK: $agpr0 = V_ACCVGPR_WRITE_B32_e64 9, implicit $exec, implicit-def $agpr0_agpr1
+ ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 -16, implicit $exec, implicit-def $agpr0_agpr1
+ $agpr0_agpr1 = AV_MOV_B64_IMM_PSEUDO 18446744004990074889, implicit $exec
+...
+
+---
+name: av_mov_b64_imm_pseudo_vgpr_inv2pi
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GFX908-LABEL: name: av_mov_b64_imm_pseudo_vgpr_inv2pi
+ ; GFX908: $vgpr0 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr0_vgpr1
+ ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
+ ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr2_vgpr3
+ ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr2_vgpr3
+ ; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr4_vgpr5
+ ; GFX908-NEXT: $vgpr5 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr4_vgpr5
+ ;
+ ; GFX90A-LABEL: name: av_mov_b64_imm_pseudo_vgpr_inv2pi
+ ; GFX90A: $vgpr0 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr0_vgpr1
+ ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
+ ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr2_vgpr3
+ ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr2_vgpr3
+ ; GFX90A-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 8, 1042479491, 8, 1042479491, 0, 0, 0, 0, 0, implicit $exec
+ ;
+ ; GFX942-LABEL: name: av_mov_b64_imm_pseudo_vgpr_inv2pi
+ ; GFX942: $vgpr0_vgpr1 = V_MOV_B64_e32 1042479491, implicit $exec
+ ; GFX942-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr2_vgpr3
+ ; GFX942-NEXT: $vgpr3 = V_MOV_B32_e32 1042479491, implicit $exec, implicit-def $vgpr2_vgpr3
+ ; GFX942-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 8, 1042479491, 8, 1042479491, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr0_vgpr1 = AV_MOV_B64_IMM_PSEUDO 1042479491, implicit $exec
+ $vgpr2_vgpr3 = AV_MOV_B64_IMM_PSEUDO 4477415320595726336, implicit $exec
+ $vgpr4_vgpr5 = AV_MOV_B64_IMM_PSEUDO 4477415321638205827, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
index b584f6d..122e683 100644
--- a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
+++ b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
@@ -54,9 +54,8 @@ define void @back_off_barrier_no_fence(ptr %in, ptr %out) #0 {
; GFX11-BACKOFF: ; %bb.0:
; GFX11-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-BACKOFF-NEXT: flat_load_b32 v0, v[0:1]
-; GFX11-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-BACKOFF-NEXT: s_barrier
+; GFX11-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-BACKOFF-NEXT: flat_store_b32 v[2:3], v0
; GFX11-BACKOFF-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-BACKOFF-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
index 752a87a..02ead57 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -344,7 +344,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX1250-NEXT: flat_store_b16 v[2:3], v0
+; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
entry:
%a.cvt = fptrunc float %a to bfloat
@@ -380,7 +380,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, |v0|, s0
-; GFX1250-NEXT: flat_store_b16 v[2:3], v0
+; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
entry:
%a.abs = call float @llvm.fabs.f32(float %a)
@@ -417,7 +417,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, -v0, s0
-; GFX1250-NEXT: flat_store_b16 v[2:3], v0
+; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
entry:
%a.neg = fneg float %a
@@ -480,7 +480,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
; GFX1250-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX1250-NEXT: flat_store_b16 v[2:3], v0
+; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
entry:
%a.cvt = fptrunc double %a to bfloat
@@ -543,7 +543,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX1250-NEXT: flat_store_b16 v[2:3], v0
+; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
entry:
%a.neg = fneg double %a
@@ -607,7 +607,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX1250-NEXT: flat_store_b16 v[2:3], v0
+; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
entry:
%a.abs = call double @llvm.fabs.f64(double %a)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
index 9979e83..30a7864 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
@@ -368,10 +368,7 @@ define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) {
define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
; GCN-LABEL: test_clamp_bf16_folding:
; GCN: ; %bb.0:
-; GCN-NEXT: v_exp_bf16_e32 v0, v0
-; GCN-NEXT: v_nop
-; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
-; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GCN-NEXT: v_exp_bf16_e64 v0, v0 clamp
; GCN-NEXT: ; return to shader part epilog
%exp = call bfloat @llvm.exp2.bf16(bfloat %src)
%max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0)
@@ -382,9 +379,7 @@ define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloat> %src1) {
; GCN-LABEL: test_clamp_v2bf16_folding:
; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1 clamp
; GCN-NEXT: ; return to shader part epilog
%mul = fmul <2 x bfloat> %src0, %src1
%max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %mul, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
@@ -396,9 +391,7 @@ define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloa
define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
; GCN-LABEL: v_test_mul_add_v2bf16_vvv:
; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_mul_bf16 v2, v2, v3
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT: v_pk_add_bf16 v2, v2, v4
+; GCN-NEXT: v_pk_fma_bf16 v2, v2, v3, v4
; GCN-NEXT: global_store_b32 v[0:1], v2, off
; GCN-NEXT: s_endpgm
%mul = fmul contract <2 x bfloat> %a, %b
@@ -410,9 +403,7 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfl
define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
; GCN-LABEL: v_test_mul_add_v2bf16_vss:
; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT: v_pk_add_bf16 v2, v2, s1
+; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, s1
; GCN-NEXT: global_store_b32 v[0:1], v2, off
; GCN-NEXT: s_endpgm
%mul = fmul contract <2 x bfloat> %a, %b
@@ -424,9 +415,9 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfl
define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
; GCN-LABEL: v_test_mul_add_v2bf16_sss:
; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_mul_bf16 v2, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT: v_pk_add_bf16 v2, v2, s2
+; GCN-NEXT: v_pk_fma_bf16 v2, s0, s1, v2
; GCN-NEXT: global_store_b32 v[0:1], v2, off
; GCN-NEXT: s_endpgm
%mul = fmul contract <2 x bfloat> %a, %b
@@ -438,9 +429,7 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfl
define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
; GCN-LABEL: v_test_mul_add_v2bf16_vsc:
; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT: v_pk_add_bf16 v2, v2, 0.5 op_sel_hi:[1,0]
+; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, 0.5 op_sel_hi:[1,1,0]
; GCN-NEXT: global_store_b32 v[0:1], v2, off
; GCN-NEXT: s_endpgm
%mul = fmul contract <2 x bfloat> %a, %b
@@ -452,9 +441,9 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfl
define amdgpu_ps void @v_test_mul_add_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat> %a) {
; GCN-LABEL: v_test_mul_add_v2bf16_vll:
; GCN: ; %bb.0:
-; GCN-NEXT: v_pk_mul_bf16 v2, 0x42c83f80, v2
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT: v_pk_add_bf16 v2, 0x43484000, v2
+; GCN-NEXT: s_mov_b32 s0, 0x43484000
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_pk_fma_bf16 v2, 0x42c83f80, v2, s0
; GCN-NEXT: global_store_b32 v[0:1], v2, off
; GCN-NEXT: s_endpgm
%mul = fmul contract <2 x bfloat> %a, <bfloat 1.0, bfloat 100.0>
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 52e697c..10e523d 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -24671,7 +24671,6 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
ret <32 x bfloat> %op
}
-
declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
declare <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
@@ -29673,7 +29672,6 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
ret { bfloat, i16 } %op
}
-
declare bfloat @llvm.log.bf16(bfloat)
declare bfloat @llvm.log2.bf16(bfloat)
declare bfloat @llvm.log10.bf16(bfloat)
@@ -37776,9 +37774,10 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
; GFX11TRUE16-LABEL: v_uitofp_i16_to_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v1
; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -40752,12 +40751,11 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
;
; GFX11TRUE16-LABEL: s_select_bf16:
; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v1.l, vcc_lo
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
@@ -47043,18 +47041,10 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX8-LABEL: v_fmuladd_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
@@ -47067,20 +47057,13 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX900-LABEL: v_fmuladd_bf16:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX900-NEXT: s_movk_i32 s4, 0x7fff
; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
-; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX900-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
@@ -47090,35 +47073,25 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX950-LABEL: v_fmuladd_bf16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX950-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v2, s0
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmuladd_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v0, v0, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -47126,55 +47099,38 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l
-; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v1, v3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v3, v1, v2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_bfe_u32 v0, v3, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmuladd_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX11FAKE16-NEXT: v_bfe_u32 v0, v2, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v1, v3 :: v_dual_lshlrev_b32 v1, 16, v2
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v2, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c)
@@ -47235,39 +47191,22 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX8-LABEL: v_fmuladd_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_fma_f32 v3, v5, v4, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s4, v1
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
@@ -47279,36 +47218,22 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX900-LABEL: v_fmuladd_v2bf16:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX900-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX900-NEXT: s_movk_i32 s4, 0x7fff
-; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX900-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX900-NEXT: v_fma_f32 v3, v5, v4, v3
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4
; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
-; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
-; GFX900-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
@@ -47319,150 +47244,94 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX950-LABEL: v_fmuladd_v2bf16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX950-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
-; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX950-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX950-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: v_fmac_f32_e32 v3, v5, v4
+; GFX950-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v2, v3
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmuladd_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_bfe_u32 v1, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
-; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v4
+; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-NEXT: v_bfe_u32 v0, v3, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX10-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fmuladd_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_dual_mul_f32 v3, v4, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX11TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_add_f32 v1, v1, v3
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX11TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4
+; GFX11TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_bfe_u32 v0, v3, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmuladd_v2bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_dual_mul_f32 v3, v4, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_bfe_u32 v1, v3, 16, 1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
+; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX11FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_add_f32 v1, v1, v3
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX11FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4
+; GFX11FAKE16-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_bfe_u32 v0, v3, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
ret <2 x bfloat> %op
@@ -47542,57 +47411,33 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX8-LABEL: v_fmuladd_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_fma_f32 v3, v6, v5, v3
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
@@ -47605,52 +47450,31 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX900-LABEL: v_fmuladd_v3bf16:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX900-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX900-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX900-NEXT: s_movk_i32 s4, 0x7fff
; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
-; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX900-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX900-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX900-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX900-NEXT: v_fma_f32 v3, v6, v5, v3
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; GFX900-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
@@ -47662,211 +47486,132 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX950-LABEL: v_fmuladd_v3bf16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX950-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX950-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX950-NEXT: v_fmac_f32_e32 v5, v1, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v5, s0
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX950-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX950-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX950-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: v_fmac_f32_e32 v3, v6, v5
+; GFX950-NEXT: v_fmac_f32_e32 v4, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v4, v3
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmuladd_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_mul_f32_e32 v3, v7, v6
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v5
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
-; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7
+; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3
+; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2
+; GFX10-NEXT: v_bfe_u32 v1, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_bfe_u32 v0, v5, 16, 1
+; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX10-NEXT: v_add3_u32 v1, v1, v6, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX10-NEXT: v_add3_u32 v0, v0, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v2, v1, 0x7060302
+; GFX10-NEXT: v_alignbit_b32 v1, s4, v3, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fmuladd_v3bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX11TRUE16-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX11TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
-; GFX11TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v4, v0, v2 :: v_dual_fmac_f32 v5, v1, v3
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v6, v8, v7
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v3, v5, 16, 1
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX11TRUE16-NEXT: v_bfe_u32 v0, v6, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v6
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
-; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
-; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
-; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11TRUE16-NEXT: v_add3_u32 v8, v9, v1, 0x7fff
-; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v10, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_add_f32_e32 v2, v2, v3
-; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v1, v8, v6 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11TRUE16-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_add3_u32 v5, v6, v1, 0x7fff
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v4, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v6, 0x7fff
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11TRUE16-NEXT: v_add3_u32 v2, v3, v5, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v5
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmuladd_v3bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v3, 16, v3
-; GFX11FAKE16-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11FAKE16-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX11FAKE16-NEXT: v_mul_f32_e32 v3, v7, v6
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo
-; GFX11FAKE16-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3
-; GFX11FAKE16-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v4, v0, v2
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v5, v1, v3
+; GFX11FAKE16-NEXT: v_bfe_u32 v1, v6, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
+; GFX11FAKE16-NEXT: v_bfe_u32 v0, v5, 16, 1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v6, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v5, 0x7fff
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v1, 0x7060302
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
ret <3 x bfloat> %op
@@ -47966,75 +47711,43 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX8-LABEL: v_fmuladd_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v1
-; GFX8-NEXT: v_mul_f32_e32 v6, v7, v6
-; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v5
-; GFX8-NEXT: v_add_f32_e32 v6, v6, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX8-NEXT: v_fma_f32 v6, v8, v7, v6
; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
-; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX8-NEXT: v_fma_f32 v3, v7, v5, v3
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
@@ -48048,68 +47761,40 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX900-LABEL: v_fmuladd_v4bf16:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v1
-; GFX900-NEXT: v_mul_f32_e32 v6, v7, v6
-; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX900-NEXT: s_movk_i32 s4, 0x7fff
-; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
-; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6
-; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
-; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v5
-; GFX900-NEXT: v_add_f32_e32 v6, v6, v7
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX900-NEXT: v_fma_f32 v6, v8, v7, v6
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX900-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4
; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v1
-; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc
-; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
-; GFX900-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX900-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
-; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GFX900-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX900-NEXT: v_fma_f32 v3, v7, v5, v3
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4
; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
-; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
-; GFX900-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
@@ -48121,264 +47806,162 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX950-LABEL: v_fmuladd_v4bf16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX950-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX950-NEXT: v_mul_f32_e32 v6, v7, v6
-; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v5
-; GFX950-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX950-NEXT: v_fmac_f32_e32 v6, v8, v7
+; GFX950-NEXT: v_fmac_f32_e32 v5, v1, v3
+; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX950-NEXT: v_fmac_f32_e32 v1, v7, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX950-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, s0
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX950-NEXT: v_add_f32_e32 v6, v6, v7
-; GFX950-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX950-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6
+; GFX950-NEXT: v_fmac_f32_e32 v3, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v3, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v5, v6
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmuladd_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_mul_f32_e32 v6, v7, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2
+; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v6
-; GFX10-NEXT: v_mul_f32_e32 v7, v9, v7
-; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
+; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3
+; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v8
+; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v6
+; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2
+; GFX10-NEXT: v_add3_u32 v0, v10, v6, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX10-NEXT: v_bfe_u32 v3, v7, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX10-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v11, v0, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v7
-; GFX10-NEXT: v_add3_u32 v9, v9, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v0
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
+; GFX10-NEXT: v_add3_u32 v0, v2, v5, 0x7fff
+; GFX10-NEXT: v_add3_u32 v2, v3, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v7
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT: v_add3_u32 v11, v11, v0, 0x7fff
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v8
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc_lo
-; GFX10-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX10-NEXT: v_add3_u32 v4, v7, v3, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
-; GFX10-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
-; GFX10-NEXT: v_add3_u32 v5, v7, v2, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
+; GFX10-NEXT: v_add3_u32 v6, v8, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v9, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v3, v2, 0x7060302
+; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fmuladd_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_lshlrev_b32 v3, 16, v3
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v5
-; GFX11TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
-; GFX11TRUE16-NEXT: v_dual_mul_f32 v3, v9, v7 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
-; GFX11TRUE16-NEXT: v_add3_u32 v9, v10, v6, 0x7fff
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX11TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v7, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
-; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v0
-; GFX11TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v10, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v9, v11 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11TRUE16-NEXT: v_add_f32_e32 v2, v6, v8
-; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v5, v1, v3
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v7, 16, v4
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11TRUE16-NEXT: v_bfe_u32 v3, v5, 16, 1
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v7, v10, v8
+; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v1, v0, v2
+; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v5, 0x7fff
+; GFX11TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v0, v7, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v6
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v7, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v4, v9, v6, 0x7fff
+; GFX11TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v3, v7, v6 :: v_dual_lshlrev_b32 v6, 16, v4
-; GFX11TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_dual_add_f32 v0, v0, v6 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11TRUE16-NEXT: v_add3_u32 v6, v7, v2, 0x7fff
-; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11TRUE16-NEXT: v_add3_u32 v5, v9, v1, 0x7fff
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v10, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc_lo
-; GFX11TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v2
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_add3_u32 v5, v9, v3, 0x7fff
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v3
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v2
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmuladd_v4bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1
-; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
-; GFX11FAKE16-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_and_b32 v2, 0xffff0000, v2
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v5, v1, v3
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v7, 16, v4
+; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
+; GFX11FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v6
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11FAKE16-NEXT: v_mul_f32_e32 v7, v9, v7
-; GFX11FAKE16-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
-; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v3, v10, v3 :: v_dual_mul_f32 v0, v0, v2
-; GFX11FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v7
-; GFX11FAKE16-NEXT: v_add3_u32 v9, v9, v7, 0x7fff
-; GFX11FAKE16-NEXT: v_bfe_u32 v11, v0, 16, 1
-; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_add3_u32 v11, v11, v0, 0x7fff
-; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v1, v2, v6 :: v_dual_lshlrev_b32 v6, 16, v4
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v4, v0, v2
+; GFX11FAKE16-NEXT: v_add3_u32 v0, v10, v6, 0x7fff
+; GFX11FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v7, v9, v8
+; GFX11FAKE16-NEXT: v_bfe_u32 v8, v4, 16, 1
+; GFX11FAKE16-NEXT: v_add3_u32 v0, v2, v5, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_bfe_u32 v3, v7, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v2, v9, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11FAKE16-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v2, 0xffff0000, v2
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc_lo
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11FAKE16-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11FAKE16-NEXT: v_bfe_u32 v6, v1, 16, 1
-; GFX11FAKE16-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_add_f32_e32 v3, v3, v8
-; GFX11FAKE16-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11FAKE16-NEXT: v_add3_u32 v4, v7, v3, 0x7fff
-; GFX11FAKE16-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
-; GFX11FAKE16-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
-; GFX11FAKE16-NEXT: v_add3_u32 v5, v7, v2, 0x7fff
-; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11FAKE16-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
-; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
+; GFX11FAKE16-NEXT: v_add3_u32 v6, v8, v4, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v3, v7, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v7
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v0, v9, vcc_lo
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v3, v2, 0x7060302
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
ret <4 x bfloat> %op
diff --git a/llvm/test/CodeGen/AMDGPU/bitop3.ll b/llvm/test/CodeGen/AMDGPU/bitop3.ll
index ba818f6..187f19f 100644
--- a/llvm/test/CodeGen/AMDGPU/bitop3.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitop3.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn-- -mcpu=gfx950 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950,GFX950-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=gfx950 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950,GFX950-GISEL %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-- -mcpu=gfx950 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950,GFX950-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-- -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1250,GFX1250-SDAG,GFX1250-SDAG-FAKE16,GFX1250-FAKE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-- -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1250,GFX1250-SDAG,GFX1250-SDAG-TRUE16,GFX1250-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1250,GFX1250-GISEL,GFX1250-GISEL-FAKE16,GFX1250-FAKE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1250,GFX1250-GISEL,GFX1250-GISEL-TRUE16,GFX1250-TRUE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-- -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1250,GFX1250-GISEL,GFX1250-GISEL-FAKE16,GFX1250-FAKE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-- -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1250,GFX1250-GISEL,GFX1250-GISEL-TRUE16,GFX1250-TRUE16 %s
; ========= Single bit functions =========
diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
index 2761cba..bfef88c 100644
--- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -197,7 +197,7 @@ define amdgpu_kernel void @br_cc_f16_imm_a(
; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0.5, v1.l
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB1_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %one
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x3800
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3800
; GFX11-TRUE16-NEXT: .LBB1_2: ; %two
; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
@@ -303,7 +303,7 @@ define amdgpu_kernel void @br_cc_f16_imm_b(
; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v1.l
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB2_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %two
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x3800
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3800
; GFX11-TRUE16-NEXT: .LBB2_2: ; %one
; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll
index 96b191d..2ce54f8a 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll
@@ -18,7 +18,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f64_offset_no_rtn(double %val, <4 x i3
; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_OFFSET killed [[COPY7]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
@@ -40,7 +40,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f64_offen_no_rtn(double %val, <4 x i32
; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_OFFEN killed [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -62,7 +62,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f64_idxen_no_rtn(double %val, <4 x i32
; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_IDXEN killed [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
@@ -86,7 +86,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f64_bothen_no_rtn(double %val, <4 x i3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_BOTHEN killed [[COPY9]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
@@ -107,7 +107,7 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offset_rtn(double %val, <4 x i32
; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY7]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY8]], implicit $exec
@@ -135,7 +135,7 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_offen_rtn(double %val, <4 x i32>
; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
@@ -163,7 +163,7 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_idxen_rtn(double %val, <4 x i32>
; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY8]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
@@ -193,7 +193,7 @@ define amdgpu_ps double @buffer_atomic_fadd_f64_bothen_rtn(double %val, <4 x i32
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY9]], killed [[REG_SEQUENCE2]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
@@ -226,7 +226,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f64_offset_no_rtn(double %val, ptr
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+ ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]]
; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_OFFSET killed [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
@@ -254,7 +254,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f64_offen_no_rtn(double %val, ptr
; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+ ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]]
; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_OFFEN killed [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -282,7 +282,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f64_idxen_no_rtn(double %val, ptr
; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+ ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]]
; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_IDXEN killed [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
@@ -312,7 +312,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f64_bothen_no_rtn(double %val, ptr
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+ ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]]
; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F64_BOTHEN killed [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
@@ -339,7 +339,7 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offset_rtn(double %val, ptr
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+ ; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFSET_RTN [[COPY11]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFSET_RTN]].sub0
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
@@ -373,7 +373,7 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_offen_rtn(double %val, ptr a
; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+ ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_OFFEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_OFFEN_RTN]].sub0
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
@@ -407,7 +407,7 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_idxen_rtn(double %val, ptr a
; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+ ; GFX90A_GFX942-NEXT: [[COPY12:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_IDXEN_RTN [[COPY12]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_IDXEN_RTN]].sub0
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
@@ -443,7 +443,7 @@ define amdgpu_ps double @buffer_ptr_atomic_fadd_f64_bothen_rtn(double %val, ptr
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE3]]
+ ; GFX90A_GFX942-NEXT: [[COPY13:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE3]]
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN:%[0-9]+]]:vreg_64_align2 = BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN [[COPY13]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64) on %ir.rsrc, align 1, addrspace 8)
; GFX90A_GFX942-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_F64_BOTHEN_RTN]].sub0
; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY14]], implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index f4b432d..0ceb901 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -3443,15 +3443,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@@ -3569,14 +3568,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -3884,15 +3882,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@@ -4007,14 +4004,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -4328,15 +4324,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
@@ -4556,15 +4551,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index 6f1675e..cad4c39 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -2512,16 +2512,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@@ -2640,20 +2640,19 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
-; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -2973,16 +2972,16 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@@ -3098,20 +3097,19 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
-; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -3437,16 +3435,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v4.h, v4.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5
; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
@@ -3672,16 +3670,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v4.h, v4.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5
; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index acb27be..6275afd 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -2512,16 +2512,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@@ -2640,20 +2640,19 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
-; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -2973,16 +2972,16 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@@ -3098,20 +3097,19 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen
; GFX11-TRUE16-NEXT: s_not_b32 s6, s5
; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0
-; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
@@ -3437,16 +3435,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v4.h, v4.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5
; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
@@ -3672,16 +3670,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v4.h, v4.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5
; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsic-mmo-type.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsic-mmo-type.ll
index efaee6f..146010c 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsic-mmo-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsic-mmo-type.ll
@@ -48,10 +48,6 @@ define amdgpu_ps void @buffer_store_v8f16(ptr addrspace(8) inreg %rsrc, <8 x hal
; GCN-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
; GCN-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY12]], %subreg.sub0, killed [[COPY11]], %subreg.sub1, killed [[COPY10]], %subreg.sub2, killed [[COPY9]], %subreg.sub3
- ; GCN-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GCN-NEXT: [[COPY13:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE3]]
diff --git a/llvm/test/CodeGen/AMDGPU/bundle-breaks-phy-liveness.mir b/llvm/test/CodeGen/AMDGPU/bundle-breaks-phy-liveness.mir
new file mode 100644
index 0000000..5c7dc8a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bundle-breaks-phy-liveness.mir
@@ -0,0 +1,36 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-insert-hard-clauses -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN-CLAUSE %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-post-ra-bundler -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN-BUNDLE %s
+
+---
+name: clause_implicit_def
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr32
+
+ ; GCN-CLAUSE-LABEL: name: clause_implicit_def
+ ; GCN-CLAUSE: liveins: $vgpr0, $sgpr32
+ ; GCN-CLAUSE-NEXT: {{ $}}
+ ; GCN-CLAUSE-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit $vgpr0, implicit $sgpr32, implicit $exec, implicit $flat_scr {
+ ; GCN-CLAUSE-NEXT: S_CLAUSE 1
+ ; GCN-CLAUSE-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr
+ ; GCN-CLAUSE-NEXT: $vgpr2 = IMPLICIT_DEF
+ ; GCN-CLAUSE-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; GCN-CLAUSE-NEXT: SCRATCH_STORE_DWORDX2_SADDR internal $vgpr2_vgpr3, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr
+ ; GCN-CLAUSE-NEXT: }
+ ;
+ ; GCN-BUNDLE-LABEL: name: clause_implicit_def
+ ; GCN-BUNDLE: liveins: $vgpr0, $sgpr32
+ ; GCN-BUNDLE-NEXT: {{ $}}
+ ; GCN-BUNDLE-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit $vgpr0, implicit $sgpr32, implicit $exec, implicit $flat_scr {
+ ; GCN-BUNDLE-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr
+ ; GCN-BUNDLE-NEXT: $vgpr2 = IMPLICIT_DEF
+ ; GCN-BUNDLE-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; GCN-BUNDLE-NEXT: SCRATCH_STORE_DWORDX2_SADDR internal $vgpr2_vgpr3, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr
+ ; GCN-BUNDLE-NEXT: }
+ SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr
+ $vgpr2 = IMPLICIT_DEF
+ $vgpr3 = IMPLICIT_DEF
+ SCRATCH_STORE_DWORDX2_SADDR $vgpr2_vgpr3, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr
+...
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 2a1be99..b8dd377 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -426,16 +426,27 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: test_call_external_void_func_i8_imm:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8@rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8@rel32@hi+12
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: test_call_external_void_func_i8_imm:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7b
+; GFX11-TRUE16-NEXT: s_getpc_b64 s[2:3]
+; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, external_void_func_i8@rel32@lo+4
+; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, external_void_func_i8@rel32@hi+12
+; GFX11-TRUE16-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-TRUE16-NEXT: s_mov_b32 s32, 0
+; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_i8_imm:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-FAKE16-NEXT: s_getpc_b64 s[2:3]
+; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, external_void_func_i8@rel32@lo+4
+; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, external_void_func_i8@rel32@hi+12
+; GFX11-FAKE16-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-FAKE16-NEXT: s_mov_b32 s32, 0
+; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX11-FAKE16-NEXT: s_endpgm
;
; HSA-LABEL: test_call_external_void_func_i8_imm:
; HSA: ; %bb.0:
@@ -723,16 +734,27 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: test_call_external_void_func_i16_imm:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16@rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16@rel32@hi+12
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: test_call_external_void_func_i16_imm:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7b
+; GFX11-TRUE16-NEXT: s_getpc_b64 s[2:3]
+; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, external_void_func_i16@rel32@lo+4
+; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, external_void_func_i16@rel32@hi+12
+; GFX11-TRUE16-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-TRUE16-NEXT: s_mov_b32 s32, 0
+; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_i16_imm:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-FAKE16-NEXT: s_getpc_b64 s[2:3]
+; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, external_void_func_i16@rel32@lo+4
+; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, external_void_func_i16@rel32@hi+12
+; GFX11-FAKE16-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-FAKE16-NEXT: s_mov_b32 s32, 0
+; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX11-FAKE16-NEXT: s_endpgm
;
; HSA-LABEL: test_call_external_void_func_i16_imm:
; HSA: ; %bb.0:
@@ -1642,16 +1664,27 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: test_call_external_void_func_f16_imm:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400
-; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f16@rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f16@rel32@hi+12
-; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT: s_mov_b32 s32, 0
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: test_call_external_void_func_f16_imm:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x4400
+; GFX11-TRUE16-NEXT: s_getpc_b64 s[2:3]
+; GFX11-TRUE16-NEXT: s_add_u32 s2, s2, external_void_func_f16@rel32@lo+4
+; GFX11-TRUE16-NEXT: s_addc_u32 s3, s3, external_void_func_f16@rel32@hi+12
+; GFX11-TRUE16-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-TRUE16-NEXT: s_mov_b32 s32, 0
+; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_f16_imm:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x4400
+; GFX11-FAKE16-NEXT: s_getpc_b64 s[2:3]
+; GFX11-FAKE16-NEXT: s_add_u32 s2, s2, external_void_func_f16@rel32@lo+4
+; GFX11-FAKE16-NEXT: s_addc_u32 s3, s3, external_void_func_f16@rel32@hi+12
+; GFX11-FAKE16-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GFX11-FAKE16-NEXT: s_mov_b32 s32, 0
+; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX11-FAKE16-NEXT: s_endpgm
;
; HSA-LABEL: test_call_external_void_func_f16_imm:
; HSA: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index ff80250..ddd3b152 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -3,6 +3,8 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1250,GFX1250-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1250,GFX1250-FAKE16 %s
; Make sure we don't crash or assert on spir_kernel calling convention.
@@ -34,6 +36,14 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: kernel:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
store i32 0, ptr addrspace(1) %out
ret void
@@ -70,6 +80,16 @@ define amdgpu_ps half @ps_ret_cc_f16(half %arg0) {
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-TRUE16-LABEL: ps_ret_cc_f16:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX1250-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-FAKE16-LABEL: ps_ret_cc_f16:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0
+; GFX1250-FAKE16-NEXT: ; return to shader part epilog
%add = fadd half %arg0, 1.0
ret half %add
}
@@ -96,26 +116,71 @@ define amdgpu_ps half @ps_ret_cc_inreg_f16(half inreg %arg0) {
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s0, 1.0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: ps_ret_cc_inreg_f16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_add_f16 s0, s0, 1.0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; return to shader part epilog
%add = fadd half %arg0, 1.0
ret half %add
}
define fastcc float @fastcc(float %arg0) #0 {
-; GCN-LABEL: fastcc:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: fastcc:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_add_f32_e32 v0, 4.0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: fastcc:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_add_f32_e32 v0, 4.0, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fastcc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_f32_e32 v0, 4.0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: fastcc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_add_f32_e32 v0, 4.0, v0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%add = fadd float %arg0, 4.0
ret float %add
}
define coldcc float @coldcc(float %arg0) #0 {
-; GCN-LABEL: coldcc:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: coldcc:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_add_f32_e32 v0, 4.0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: coldcc:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_add_f32_e32 v0, 4.0, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: coldcc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_f32_e32 v0, 4.0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: coldcc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_add_f32_e32 v0, 4.0, v0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%add = fadd float %arg0, 4.0
ret float %add
}
@@ -209,6 +274,23 @@ define amdgpu_kernel void @call_coldcc() #0 {
; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX11-NEXT: global_store_b32 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: call_coldcc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX1250-NEXT: s_get_pc_i64 s[6:7]
+; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[6:7], coldcc@gotpcrel+4
+; GFX1250-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0
+; GFX1250-NEXT: s_load_b64 s[12:13], s[6:7], 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], 36
+; GFX1250-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1250-NEXT: s_mov_b32 s32, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[12:13]
+; GFX1250-NEXT: global_store_b32 v[0:1], v0, off
+; GFX1250-NEXT: s_endpgm
%val = call coldcc float @coldcc(float 1.0)
store float %val, ptr addrspace(1) poison
ret void
@@ -303,6 +385,23 @@ define amdgpu_kernel void @call_fastcc() #0 {
; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX11-NEXT: global_store_b32 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: call_fastcc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX1250-NEXT: s_get_pc_i64 s[6:7]
+; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[6:7], fastcc@gotpcrel+4
+; GFX1250-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0
+; GFX1250-NEXT: s_load_b64 s[12:13], s[6:7], 0x0
+; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], 36
+; GFX1250-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX1250-NEXT: s_mov_b32 s32, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[12:13]
+; GFX1250-NEXT: global_store_b32 v[0:1], v0, off
+; GFX1250-NEXT: s_endpgm
%val = call fastcc float @fastcc(float 1.0)
store float %val, ptr addrspace(1) poison
ret void
@@ -331,6 +430,16 @@ define amdgpu_cs half @cs_mesa(half %arg0) {
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-TRUE16-LABEL: cs_mesa:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX1250-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-FAKE16-LABEL: cs_mesa:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0
+; GFX1250-FAKE16-NEXT: ; return to shader part epilog
%add = fadd half %arg0, 1.0
ret half %add
}
@@ -358,6 +467,16 @@ define amdgpu_ps half @ps_mesa_f16(half %arg0) {
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-TRUE16-LABEL: ps_mesa_f16:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX1250-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-FAKE16-LABEL: ps_mesa_f16:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0
+; GFX1250-FAKE16-NEXT: ; return to shader part epilog
%add = fadd half %arg0, 1.0
ret half %add
}
@@ -385,6 +504,16 @@ define amdgpu_vs half @vs_mesa(half %arg0) {
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-TRUE16-LABEL: vs_mesa:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX1250-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-FAKE16-LABEL: vs_mesa:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0
+; GFX1250-FAKE16-NEXT: ; return to shader part epilog
%add = fadd half %arg0, 1.0
ret half %add
}
@@ -412,6 +541,16 @@ define amdgpu_gs half @gs_mesa(half %arg0) {
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-TRUE16-LABEL: gs_mesa:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX1250-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-FAKE16-LABEL: gs_mesa:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0
+; GFX1250-FAKE16-NEXT: ; return to shader part epilog
%add = fadd half %arg0, 1.0
ret half %add
}
@@ -439,6 +578,16 @@ define amdgpu_hs half @hs_mesa(half %arg0) {
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-TRUE16-LABEL: hs_mesa:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX1250-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX1250-FAKE16-LABEL: hs_mesa:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0
+; GFX1250-FAKE16-NEXT: ; return to shader part epilog
%add = fadd half %arg0, 1.0
ret half %add
}
@@ -468,6 +617,11 @@ define amdgpu_ps <2 x half> @ps_mesa_v2f16(<2 x half> %arg0) {
; GFX11: ; %bb.0:
; GFX11-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: ps_mesa_v2f16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT: ; return to shader part epilog
%add = fadd <2 x half> %arg0, <half 1.0, half 1.0>
ret <2 x half> %add
}
@@ -497,6 +651,11 @@ define amdgpu_ps <2 x half> @ps_mesa_inreg_v2f16(<2 x half> inreg %arg0) {
; GFX11: ; %bb.0:
; GFX11-NEXT: v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: ps_mesa_inreg_v2f16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT: ; return to shader part epilog
%add = fadd <2 x half> %arg0, <half 1.0, half 1.0>
ret <2 x half> %add
}
@@ -528,6 +687,12 @@ define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) {
; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
; GFX11-NEXT: global_store_b32 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_v2i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
+; GFX1250-NEXT: global_store_b32 v[0:1], v0, off
+; GFX1250-NEXT: s_endpgm
%add = add <2 x i16> %arg0, <i16 1, i16 1>
store <2 x i16> %add, ptr addrspace(1) poison
ret void
@@ -563,6 +728,12 @@ define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) {
; GFX11-NEXT: v_pk_add_u16 v0, s0, 1 op_sel_hi:[1,0]
; GFX11-NEXT: global_store_b32 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_inreg_v2i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_pk_add_u16 v0, s0, 1 op_sel_hi:[1,0]
+; GFX1250-NEXT: global_store_b32 v[0:1], v0, off
+; GFX1250-NEXT: s_endpgm
%add = add <2 x i16> %arg0, <i16 1, i16 1>
store <2 x i16> %add, ptr addrspace(1) poison
ret void
@@ -603,6 +774,12 @@ define amdgpu_ps <4 x half> @ps_mesa_v4f16(<4 x half> %arg0) {
; GFX11-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: ps_mesa_v4f16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT: ; return to shader part epilog
%add = fadd <4 x half> %arg0, <half 1.0, half 1.0, half 1.0, half 1.0>
ret <4 x half> %add
}
@@ -644,6 +821,12 @@ define amdgpu_ps <4 x half> @ps_mesa_inreg_v4f16(<4 x half> inreg %arg0) {
; GFX11-NEXT: v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: v_pk_add_f16 v1, s1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: ps_mesa_inreg_v4f16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT: v_pk_add_f16 v1, s1, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT: ; return to shader part epilog
%add = fadd <4 x half> %arg0, <half 1.0, half 1.0, half 1.0, half 1.0>
ret <4 x half> %add
}
@@ -685,6 +868,17 @@ define amdgpu_ps void @ps_mesa_inreg_v3i32(<3 x i32> inreg %arg0) {
; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_inreg_v3i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_add_co_i32 s2, s2, 3
+; GFX1250-NEXT: s_add_co_i32 s0, s0, 1
+; GFX1250-NEXT: s_add_co_i32 s1, s1, 2
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: global_store_b96 v[0:1], v[0:2], off
+; GFX1250-NEXT: s_endpgm
%add = add <3 x i32> %arg0, <i32 1, i32 2, i32 3>
store <3 x i32> %add, ptr addrspace(1) poison
ret void
@@ -717,6 +911,17 @@ define amdgpu_ps void @ps_mesa_inreg_v3f32(<3 x float> inreg %arg0) {
; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0
; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_inreg_v3f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_add_f32 s0, s0, 1.0
+; GFX1250-NEXT: s_add_f32 s1, s1, 2.0
+; GFX1250-NEXT: s_add_f32 s2, s2, 4.0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_2)
+; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: global_store_b96 v[0:1], v[0:2], off
+; GFX1250-NEXT: s_endpgm
%add = fadd <3 x float> %arg0, <float 1.0, float 2.0, float 4.0>
store <3 x float> %add, ptr addrspace(1) poison
ret void
@@ -772,6 +977,22 @@ define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) {
; GFX11-NEXT: global_store_b32 v[0:1], v4, off
; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_inreg_v5i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_add_co_i32 s3, s3, 4
+; GFX1250-NEXT: s_add_co_i32 s2, s2, 3
+; GFX1250-NEXT: s_add_co_i32 s1, s1, 2
+; GFX1250-NEXT: s_add_co_i32 s4, s4, 5
+; GFX1250-NEXT: s_add_co_i32 s0, s0, 1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v0, s0
+; GFX1250-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v3, s3
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1250-NEXT: global_store_b128 v[0:1], v[0:3], off
+; GFX1250-NEXT: s_endpgm
%add = add <5 x i32> %arg0, <i32 1, i32 2, i32 3, i32 4, i32 5>
store <5 x i32> %add, ptr addrspace(1) poison
ret void
@@ -813,6 +1034,22 @@ define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) {
; GFX11-NEXT: global_store_b32 v[0:1], v4, off
; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_inreg_v5f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_add_f32 s3, s3, -1.0
+; GFX1250-NEXT: s_add_f32 s4, s4, 0.5
+; GFX1250-NEXT: s_add_f32 s0, s0, 1.0
+; GFX1250-NEXT: s_add_f32 s1, s1, 2.0
+; GFX1250-NEXT: s_add_f32 s2, s2, 4.0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_2)
+; GFX1250-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v0, s0
+; GFX1250-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v3, s3
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1250-NEXT: global_store_b128 v[0:1], v[0:3], off
+; GFX1250-NEXT: s_endpgm
%add = fadd <5 x float> %arg0, <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>
store <5 x float> %add, ptr addrspace(1) poison
ret void
@@ -845,6 +1082,13 @@ define amdgpu_ps void @ps_mesa_v3i32(<3 x i32> %arg0) {
; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_v3i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_add_nc_u32 v2, 3, v2 :: v_dual_add_nc_u32 v1, 2, v1
+; GFX1250-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; GFX1250-NEXT: global_store_b96 v[0:1], v[0:2], off
+; GFX1250-NEXT: s_endpgm
%add = add <3 x i32> %arg0, <i32 1, i32 2, i32 3>
store <3 x i32> %add, ptr addrspace(1) poison
ret void
@@ -876,6 +1120,13 @@ define amdgpu_ps void @ps_mesa_v3f32(<3 x float> %arg0) {
; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_v3f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_add_f32 v2, 4.0, v2 :: v_dual_add_f32 v1, 2.0, v1
+; GFX1250-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX1250-NEXT: global_store_b96 v[0:1], v[0:2], off
+; GFX1250-NEXT: s_endpgm
%add = fadd <3 x float> %arg0, <float 1.0, float 2.0, float 4.0>
store <3 x float> %add, ptr addrspace(1) poison
ret void
@@ -917,6 +1168,16 @@ define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) {
; GFX11-NEXT: global_store_b32 v[0:1], v4, off
; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_v5i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_add_nc_u32 v3, 4, v3 :: v_dual_add_nc_u32 v2, 3, v2
+; GFX1250-NEXT: v_dual_add_nc_u32 v1, 2, v1 :: v_dual_add_nc_u32 v4, 5, v4
+; GFX1250-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1250-NEXT: global_store_b128 v[0:1], v[0:3], off
+; GFX1250-NEXT: s_endpgm
%add = add <5 x i32> %arg0, <i32 1, i32 2, i32 3, i32 4, i32 5>
store <5 x i32> %add, ptr addrspace(1) poison
ret void
@@ -956,6 +1217,16 @@ define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) {
; GFX11-NEXT: global_store_b32 v[0:1], v4, off
; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_v5f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_add_f32 v3, -1.0, v3 :: v_dual_add_f32 v2, 4.0, v2
+; GFX1250-NEXT: v_dual_add_f32 v1, 2.0, v1 :: v_dual_add_f32 v4, 0.5, v4
+; GFX1250-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1250-NEXT: global_store_b128 v[0:1], v[0:3], off
+; GFX1250-NEXT: s_endpgm
%add = fadd <5 x float> %arg0, <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>
store <5 x float> %add, ptr addrspace(1) poison
ret void
@@ -987,6 +1258,18 @@ define amdgpu_ps void @ps_mesa_i16(i16 %arg0) {
; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, v0, v0
; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v0, off
; GFX11-FAKE16-NEXT: s_endpgm
+;
+; GFX1250-TRUE16-LABEL: ps_mesa_i16:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.l
+; GFX1250-TRUE16-NEXT: flat_store_b16 v[0:1], v0
+; GFX1250-TRUE16-NEXT: s_endpgm
+;
+; GFX1250-FAKE16-LABEL: ps_mesa_i16:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: v_add_nc_u16 v0, v0, v0
+; GFX1250-FAKE16-NEXT: global_store_b16 v[0:1], v0, off
+; GFX1250-FAKE16-NEXT: s_endpgm
%add = add i16 %arg0, %arg0
store i16 %add, ptr addrspace(1) poison
ret void
@@ -1016,6 +1299,14 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) {
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b16 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_inreg_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_add_co_i32 s0, s0, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: global_store_b16 v[0:1], v0, off
+; GFX1250-NEXT: s_endpgm
%add = add i16 %arg0, %arg0
store i16 %add, ptr addrspace(1) poison
ret void
@@ -1059,6 +1350,16 @@ define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) {
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: amd_kernel_i8:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s0, s0, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: global_store_b8 v[0:1], v0, off
+; GFX1250-NEXT: s_endpgm
entry:
%add = add i8 %arg0, %arg0
store i8 %add, ptr addrspace(1) poison
@@ -1114,6 +1415,22 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) {
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: amd_kernel_v2i8:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_u32 s1, s0, 0x80008
+; GFX1250-NEXT: s_add_co_i32 s0, s0, s0
+; GFX1250-NEXT: s_add_co_i32 s1, s1, s1
+; GFX1250-NEXT: s_and_b32 s0, s0, 0xff
+; GFX1250-NEXT: s_lshl_b32 s1, s1, 8
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_or_b32 s0, s0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1250-NEXT: s_endpgm
entry:
%add = add <2 x i8> %arg0, %arg0
store <2 x i8> %add, ptr addrspace(1) null
@@ -1199,6 +1516,32 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) {
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: amd_kernel_v4i8:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_lshr_b32 s1, s0, 16
+; GFX1250-NEXT: s_lshr_b32 s2, s0, 24
+; GFX1250-NEXT: s_add_co_i32 s3, s0, s0
+; GFX1250-NEXT: s_bfe_u32 s0, s0, 0x80008
+; GFX1250-NEXT: s_add_co_i32 s2, s2, s2
+; GFX1250-NEXT: s_add_co_i32 s0, s0, s0
+; GFX1250-NEXT: s_add_co_i32 s1, s1, s1
+; GFX1250-NEXT: s_and_b32 s3, s3, 0xff
+; GFX1250-NEXT: s_lshl_b32 s0, s0, 8
+; GFX1250-NEXT: s_lshl_b32 s2, s2, 8
+; GFX1250-NEXT: s_and_b32 s1, s1, 0xff
+; GFX1250-NEXT: s_or_b32 s0, s3, s0
+; GFX1250-NEXT: s_or_b32 s1, s1, s2
+; GFX1250-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX1250-NEXT: s_lshl_b32 s1, s1, 16
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_or_b32 s0, s0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1250-NEXT: s_endpgm
entry:
%add = add <4 x i8> %arg0, %arg0
store <4 x i8> %add, ptr addrspace(1) null
@@ -1271,6 +1614,27 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) {
; GFX11-NEXT: global_store_b8 v[0:1], v4, off
; GFX11-NEXT: global_store_b16 v[2:3], v5, off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: amd_kernel_v3i8:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 2
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_u32 s2, s0, 0x80008
+; GFX1250-NEXT: s_lshr_b32 s1, s0, 16
+; GFX1250-NEXT: s_add_co_i32 s0, s0, s0
+; GFX1250-NEXT: s_add_co_i32 s2, s2, s2
+; GFX1250-NEXT: s_and_b32 s0, s0, 0xff
+; GFX1250-NEXT: s_lshl_b32 s2, s2, 8
+; GFX1250-NEXT: s_add_co_i32 s1, s1, s1
+; GFX1250-NEXT: s_or_b32 s0, s0, s2
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b8 v[0:1], v4, off
+; GFX1250-NEXT: global_store_b16 v[2:3], v5, off
+; GFX1250-NEXT: s_endpgm
entry:
%add = add <3 x i8> %arg0, %arg0
store <3 x i8> %add, ptr addrspace(1) null
@@ -1370,6 +1734,36 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) {
; GFX11-NEXT: global_store_b8 v[0:1], v4, off
; GFX11-NEXT: global_store_b32 v[2:3], v5, off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: amd_kernel_v5i8:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_lshr_b32 s2, s0, 16
+; GFX1250-NEXT: s_lshr_b32 s3, s0, 24
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s0
+; GFX1250-NEXT: s_bfe_u32 s0, s0, 0x80008
+; GFX1250-NEXT: s_add_co_i32 s3, s3, s3
+; GFX1250-NEXT: s_add_co_i32 s0, s0, s0
+; GFX1250-NEXT: s_add_co_i32 s2, s2, s2
+; GFX1250-NEXT: s_and_b32 s4, s4, 0xff
+; GFX1250-NEXT: s_lshl_b32 s0, s0, 8
+; GFX1250-NEXT: s_lshl_b32 s3, s3, 8
+; GFX1250-NEXT: s_and_b32 s2, s2, 0xff
+; GFX1250-NEXT: s_or_b32 s0, s4, s0
+; GFX1250-NEXT: s_or_b32 s2, s2, s3
+; GFX1250-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX1250-NEXT: s_lshl_b32 s2, s2, 16
+; GFX1250-NEXT: s_add_co_i32 s1, s1, s1
+; GFX1250-NEXT: s_or_b32 s0, s0, s2
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b8 v[0:1], v4, off
+; GFX1250-NEXT: global_store_b32 v[2:3], v5, off
+; GFX1250-NEXT: s_endpgm
entry:
%add = add <5 x i8> %arg0, %arg0
store <5 x i8> %add, ptr addrspace(1) null
@@ -1505,6 +1899,48 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) {
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: amd_kernel_v8i8:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_lshr_b32 s2, s0, 16
+; GFX1250-NEXT: s_lshr_b32 s3, s0, 24
+; GFX1250-NEXT: s_lshr_b32 s4, s1, 16
+; GFX1250-NEXT: s_lshr_b32 s5, s1, 24
+; GFX1250-NEXT: s_bfe_u32 s6, s0, 0x80008
+; GFX1250-NEXT: s_bfe_u32 s7, s1, 0x80008
+; GFX1250-NEXT: s_add_co_i32 s1, s1, s1
+; GFX1250-NEXT: s_add_co_i32 s0, s0, s0
+; GFX1250-NEXT: s_add_co_i32 s7, s7, s7
+; GFX1250-NEXT: s_add_co_i32 s5, s5, s5
+; GFX1250-NEXT: s_add_co_i32 s4, s4, s4
+; GFX1250-NEXT: s_add_co_i32 s6, s6, s6
+; GFX1250-NEXT: s_add_co_i32 s3, s3, s3
+; GFX1250-NEXT: s_add_co_i32 s2, s2, s2
+; GFX1250-NEXT: s_and_b32 s1, s1, 0xff
+; GFX1250-NEXT: s_and_b32 s0, s0, 0xff
+; GFX1250-NEXT: s_lshl_b32 s7, s7, 8
+; GFX1250-NEXT: s_lshl_b32 s5, s5, 8
+; GFX1250-NEXT: s_and_b32 s4, s4, 0xff
+; GFX1250-NEXT: s_lshl_b32 s6, s6, 8
+; GFX1250-NEXT: s_lshl_b32 s3, s3, 8
+; GFX1250-NEXT: s_and_b32 s2, s2, 0xff
+; GFX1250-NEXT: s_or_b32 s1, s1, s7
+; GFX1250-NEXT: s_or_b32 s4, s4, s5
+; GFX1250-NEXT: s_or_b32 s0, s0, s6
+; GFX1250-NEXT: s_or_b32 s2, s2, s3
+; GFX1250-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX1250-NEXT: s_lshl_b32 s3, s4, 16
+; GFX1250-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX1250-NEXT: s_lshl_b32 s2, s2, 16
+; GFX1250-NEXT: s_or_b32 s1, s1, s3
+; GFX1250-NEXT: s_or_b32 s0, s0, s2
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1250-NEXT: s_endpgm
entry:
%add = add <8 x i8> %arg0, %arg0
store <8 x i8> %add, ptr addrspace(1) null
@@ -1740,6 +2176,81 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, 0
; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: amd_kernel_v16i8:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b64_e32 v[4:5], 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_lshr_b32 s6, s1, 16
+; GFX1250-NEXT: s_lshr_b32 s7, s1, 24
+; GFX1250-NEXT: s_lshr_b32 s8, s2, 16
+; GFX1250-NEXT: s_lshr_b32 s9, s2, 24
+; GFX1250-NEXT: s_lshr_b32 s10, s3, 16
+; GFX1250-NEXT: s_lshr_b32 s11, s3, 24
+; GFX1250-NEXT: s_lshr_b32 s4, s0, 16
+; GFX1250-NEXT: s_lshr_b32 s5, s0, 24
+; GFX1250-NEXT: s_bfe_u32 s12, s0, 0x80008
+; GFX1250-NEXT: s_bfe_u32 s13, s1, 0x80008
+; GFX1250-NEXT: s_bfe_u32 s14, s2, 0x80008
+; GFX1250-NEXT: s_bfe_u32 s15, s3, 0x80008
+; GFX1250-NEXT: s_add_co_i32 s11, s11, s11
+; GFX1250-NEXT: s_add_co_i32 s10, s10, s10
+; GFX1250-NEXT: s_add_co_i32 s9, s9, s9
+; GFX1250-NEXT: s_add_co_i32 s8, s8, s8
+; GFX1250-NEXT: s_add_co_i32 s7, s7, s7
+; GFX1250-NEXT: s_add_co_i32 s6, s6, s6
+; GFX1250-NEXT: s_add_co_i32 s3, s3, s3
+; GFX1250-NEXT: s_add_co_i32 s2, s2, s2
+; GFX1250-NEXT: s_add_co_i32 s15, s15, s15
+; GFX1250-NEXT: s_add_co_i32 s14, s14, s14
+; GFX1250-NEXT: s_lshl_b32 s11, s11, 8
+; GFX1250-NEXT: s_and_b32 s10, s10, 0xff
+; GFX1250-NEXT: s_lshl_b32 s9, s9, 8
+; GFX1250-NEXT: s_and_b32 s8, s8, 0xff
+; GFX1250-NEXT: s_add_co_i32 s1, s1, s1
+; GFX1250-NEXT: s_add_co_i32 s13, s13, s13
+; GFX1250-NEXT: s_lshl_b32 s7, s7, 8
+; GFX1250-NEXT: s_and_b32 s6, s6, 0xff
+; GFX1250-NEXT: s_add_co_i32 s0, s0, s0
+; GFX1250-NEXT: s_add_co_i32 s12, s12, s12
+; GFX1250-NEXT: s_add_co_i32 s5, s5, s5
+; GFX1250-NEXT: s_add_co_i32 s4, s4, s4
+; GFX1250-NEXT: s_and_b32 s3, s3, 0xff
+; GFX1250-NEXT: s_and_b32 s2, s2, 0xff
+; GFX1250-NEXT: s_lshl_b32 s15, s15, 8
+; GFX1250-NEXT: s_or_b32 s10, s10, s11
+; GFX1250-NEXT: s_lshl_b32 s11, s14, 8
+; GFX1250-NEXT: s_or_b32 s8, s8, s9
+; GFX1250-NEXT: s_and_b32 s1, s1, 0xff
+; GFX1250-NEXT: s_lshl_b32 s9, s13, 8
+; GFX1250-NEXT: s_or_b32 s6, s6, s7
+; GFX1250-NEXT: s_and_b32 s0, s0, 0xff
+; GFX1250-NEXT: s_lshl_b32 s7, s12, 8
+; GFX1250-NEXT: s_lshl_b32 s5, s5, 8
+; GFX1250-NEXT: s_and_b32 s4, s4, 0xff
+; GFX1250-NEXT: s_or_b32 s3, s3, s15
+; GFX1250-NEXT: s_or_b32 s2, s2, s11
+; GFX1250-NEXT: s_or_b32 s1, s1, s9
+; GFX1250-NEXT: s_or_b32 s0, s0, s7
+; GFX1250-NEXT: s_or_b32 s4, s4, s5
+; GFX1250-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX1250-NEXT: s_lshl_b32 s10, s10, 16
+; GFX1250-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX1250-NEXT: s_lshl_b32 s8, s8, 16
+; GFX1250-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX1250-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX1250-NEXT: s_lshl_b32 s4, s4, 16
+; GFX1250-NEXT: s_lshl_b32 s5, s6, 16
+; GFX1250-NEXT: s_or_b32 s3, s3, s10
+; GFX1250-NEXT: s_or_b32 s2, s2, s8
+; GFX1250-NEXT: s_or_b32 s0, s0, s4
+; GFX1250-NEXT: s_or_b32 s1, s1, s5
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1250-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX1250-NEXT: global_store_b128 v[4:5], v[0:3], off
+; GFX1250-NEXT: s_endpgm
entry:
%add = add <16 x i8> %arg0, %arg0
store <16 x i8> %add, ptr addrspace(1) null
@@ -2186,6 +2697,149 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off
; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: amd_kernel_v32i8:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b64_e32 v[8:9], 16
+; GFX1250-NEXT: v_mov_b64_e32 v[10:11], 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_lshr_b32 s16, s0, 16
+; GFX1250-NEXT: s_lshr_b32 s17, s0, 24
+; GFX1250-NEXT: s_lshr_b32 s20, s2, 16
+; GFX1250-NEXT: s_lshr_b32 s21, s2, 24
+; GFX1250-NEXT: s_lshr_b32 s14, s7, 16
+; GFX1250-NEXT: s_lshr_b32 s15, s7, 24
+; GFX1250-NEXT: s_bfe_u32 s27, s7, 0x80008
+; GFX1250-NEXT: s_add_co_i32 s17, s17, s17
+; GFX1250-NEXT: s_add_co_i32 s16, s16, s16
+; GFX1250-NEXT: s_lshr_b32 s18, s1, 16
+; GFX1250-NEXT: s_lshr_b32 s19, s1, 24
+; GFX1250-NEXT: s_lshr_b32 s22, s3, 16
+; GFX1250-NEXT: s_lshr_b32 s23, s3, 24
+; GFX1250-NEXT: s_bfe_u32 s29, s1, 0x80008
+; GFX1250-NEXT: s_bfe_u32 s30, s3, 0x80008
+; GFX1250-NEXT: s_add_co_i32 s21, s21, s21
+; GFX1250-NEXT: s_add_co_i32 s20, s20, s20
+; GFX1250-NEXT: s_lshl_b32 s17, s17, 8
+; GFX1250-NEXT: s_and_b32 s16, s16, 0xff
+; GFX1250-NEXT: s_add_co_i32 s7, s7, s7
+; GFX1250-NEXT: s_add_co_i32 s27, s27, s27
+; GFX1250-NEXT: s_add_co_i32 s15, s15, s15
+; GFX1250-NEXT: s_add_co_i32 s14, s14, s14
+; GFX1250-NEXT: s_add_co_i32 s3, s3, s3
+; GFX1250-NEXT: s_add_co_i32 s30, s30, s30
+; GFX1250-NEXT: s_add_co_i32 s23, s23, s23
+; GFX1250-NEXT: s_add_co_i32 s22, s22, s22
+; GFX1250-NEXT: s_lshl_b32 s21, s21, 8
+; GFX1250-NEXT: s_and_b32 s20, s20, 0xff
+; GFX1250-NEXT: s_add_co_i32 s1, s1, s1
+; GFX1250-NEXT: s_add_co_i32 s29, s29, s29
+; GFX1250-NEXT: s_add_co_i32 s19, s19, s19
+; GFX1250-NEXT: s_add_co_i32 s18, s18, s18
+; GFX1250-NEXT: s_lshr_b32 s10, s5, 16
+; GFX1250-NEXT: s_lshr_b32 s11, s5, 24
+; GFX1250-NEXT: s_lshr_b32 s12, s6, 16
+; GFX1250-NEXT: s_lshr_b32 s13, s6, 24
+; GFX1250-NEXT: s_or_b32 s16, s16, s17
+; GFX1250-NEXT: s_and_b32 s7, s7, 0xff
+; GFX1250-NEXT: s_lshl_b32 s17, s27, 8
+; GFX1250-NEXT: s_lshl_b32 s15, s15, 8
+; GFX1250-NEXT: s_and_b32 s14, s14, 0xff
+; GFX1250-NEXT: s_and_b32 s3, s3, 0xff
+; GFX1250-NEXT: s_lshl_b32 s30, s30, 8
+; GFX1250-NEXT: s_lshl_b32 s23, s23, 8
+; GFX1250-NEXT: s_and_b32 s22, s22, 0xff
+; GFX1250-NEXT: s_or_b32 s20, s20, s21
+; GFX1250-NEXT: s_and_b32 s1, s1, 0xff
+; GFX1250-NEXT: s_lshl_b32 s21, s29, 8
+; GFX1250-NEXT: s_lshl_b32 s19, s19, 8
+; GFX1250-NEXT: s_and_b32 s18, s18, 0xff
+; GFX1250-NEXT: s_lshr_b32 s8, s4, 16
+; GFX1250-NEXT: s_lshr_b32 s9, s4, 24
+; GFX1250-NEXT: s_bfe_u32 s24, s4, 0x80008
+; GFX1250-NEXT: s_bfe_u32 s25, s5, 0x80008
+; GFX1250-NEXT: s_bfe_u32 s26, s6, 0x80008
+; GFX1250-NEXT: s_or_b32 s7, s7, s17
+; GFX1250-NEXT: s_or_b32 s14, s14, s15
+; GFX1250-NEXT: s_add_co_i32 s13, s13, s13
+; GFX1250-NEXT: s_add_co_i32 s12, s12, s12
+; GFX1250-NEXT: s_add_co_i32 s11, s11, s11
+; GFX1250-NEXT: s_add_co_i32 s10, s10, s10
+; GFX1250-NEXT: s_bfe_u32 s28, s0, 0x80008
+; GFX1250-NEXT: s_or_b32 s3, s3, s30
+; GFX1250-NEXT: s_or_b32 s22, s22, s23
+; GFX1250-NEXT: s_bfe_u32 s23, s2, 0x80008
+; GFX1250-NEXT: s_or_b32 s1, s1, s21
+; GFX1250-NEXT: s_or_b32 s18, s18, s19
+; GFX1250-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX1250-NEXT: s_lshl_b32 s14, s14, 16
+; GFX1250-NEXT: s_add_co_i32 s6, s6, s6
+; GFX1250-NEXT: s_add_co_i32 s26, s26, s26
+; GFX1250-NEXT: s_lshl_b32 s13, s13, 8
+; GFX1250-NEXT: s_and_b32 s12, s12, 0xff
+; GFX1250-NEXT: s_add_co_i32 s5, s5, s5
+; GFX1250-NEXT: s_add_co_i32 s25, s25, s25
+; GFX1250-NEXT: s_lshl_b32 s11, s11, 8
+; GFX1250-NEXT: s_and_b32 s10, s10, 0xff
+; GFX1250-NEXT: s_add_co_i32 s4, s4, s4
+; GFX1250-NEXT: s_add_co_i32 s24, s24, s24
+; GFX1250-NEXT: s_add_co_i32 s9, s9, s9
+; GFX1250-NEXT: s_add_co_i32 s8, s8, s8
+; GFX1250-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX1250-NEXT: s_lshl_b32 s22, s22, 16
+; GFX1250-NEXT: s_add_co_i32 s2, s2, s2
+; GFX1250-NEXT: s_add_co_i32 s23, s23, s23
+; GFX1250-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX1250-NEXT: s_lshl_b32 s18, s18, 16
+; GFX1250-NEXT: s_add_co_i32 s0, s0, s0
+; GFX1250-NEXT: s_add_co_i32 s28, s28, s28
+; GFX1250-NEXT: s_or_b32 s7, s7, s14
+; GFX1250-NEXT: s_and_b32 s6, s6, 0xff
+; GFX1250-NEXT: s_lshl_b32 s14, s26, 8
+; GFX1250-NEXT: s_or_b32 s12, s12, s13
+; GFX1250-NEXT: s_and_b32 s5, s5, 0xff
+; GFX1250-NEXT: s_lshl_b32 s13, s25, 8
+; GFX1250-NEXT: s_or_b32 s10, s10, s11
+; GFX1250-NEXT: s_and_b32 s4, s4, 0xff
+; GFX1250-NEXT: s_lshl_b32 s11, s24, 8
+; GFX1250-NEXT: s_lshl_b32 s9, s9, 8
+; GFX1250-NEXT: s_and_b32 s8, s8, 0xff
+; GFX1250-NEXT: s_or_b32 s3, s3, s22
+; GFX1250-NEXT: s_and_b32 s2, s2, 0xff
+; GFX1250-NEXT: s_lshl_b32 s22, s23, 8
+; GFX1250-NEXT: s_or_b32 s1, s1, s18
+; GFX1250-NEXT: s_and_b32 s0, s0, 0xff
+; GFX1250-NEXT: s_lshl_b32 s18, s28, 8
+; GFX1250-NEXT: s_or_b32 s6, s6, s14
+; GFX1250-NEXT: s_or_b32 s5, s5, s13
+; GFX1250-NEXT: s_or_b32 s4, s4, s11
+; GFX1250-NEXT: s_or_b32 s8, s8, s9
+; GFX1250-NEXT: s_or_b32 s2, s2, s22
+; GFX1250-NEXT: s_or_b32 s0, s0, s18
+; GFX1250-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX1250-NEXT: s_lshl_b32 s12, s12, 16
+; GFX1250-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX1250-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX1250-NEXT: s_lshl_b32 s8, s8, 16
+; GFX1250-NEXT: s_lshl_b32 s9, s10, 16
+; GFX1250-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX1250-NEXT: s_lshl_b32 s20, s20, 16
+; GFX1250-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX1250-NEXT: s_lshl_b32 s16, s16, 16
+; GFX1250-NEXT: s_or_b32 s6, s6, s12
+; GFX1250-NEXT: s_or_b32 s4, s4, s8
+; GFX1250-NEXT: s_or_b32 s5, s5, s9
+; GFX1250-NEXT: s_or_b32 s2, s2, s20
+; GFX1250-NEXT: s_or_b32 s0, s0, s16
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX1250-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX1250-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[8:9], v[0:3], off
+; GFX1250-NEXT: global_store_b128 v[10:11], v[4:7], off
+; GFX1250-NEXT: s_endpgm
entry:
%add = add <32 x i8> %arg0, %arg0
store <32 x i8> %add, ptr addrspace(1) null
@@ -2212,6 +2866,12 @@ define amdgpu_cs void @amdgpu_cs_i1(i1 %arg0) {
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: amdgpu_cs_i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1250-NEXT: global_store_b8 v[0:1], v0, off
+; GFX1250-NEXT: s_endpgm
store i1 %arg0, ptr addrspace(1) poison
ret void
}
@@ -2330,6 +2990,56 @@ define amdgpu_cs void @amdgpu_cs_v8i1(<8 x i1> %arg0) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-FAKE16-NEXT: s_endpgm
+;
+; GFX1250-TRUE16-LABEL: amdgpu_cs_v8i1:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v2.h, 3, v3.l
+; GFX1250-TRUE16-NEXT: v_and_b16 v3.l, v6.l, 1
+; GFX1250-TRUE16-NEXT: v_and_b16 v2.l, v2.l, 1
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.h, 3, v7.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v3.h, 1, v5.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.l, 1, v1.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v3.l, 2, v3.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v2.l, 2, v2.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v3.l
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.h, v4.l, v3.h, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v2.l
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.h, v1.h, v0.h, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v2.l, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v0.h, 4, v0.h
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v0.h, 15 bitop3:0xec
+; GFX1250-TRUE16-NEXT: flat_store_b8 v[0:1], v0
+; GFX1250-TRUE16-NEXT: s_endpgm
+;
+; GFX1250-FAKE16-LABEL: amdgpu_cs_v8i1:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v7, 3, v7
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v5, 1, v5
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v3, 3, v3
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v6, 2, v6
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v2, 2, v2
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v1, 1, v1
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v4, v4, v5, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-FAKE16-NEXT: v_or_b32_e32 v6, v7, v6
+; GFX1250-FAKE16-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v1, v4, v6, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v2, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v1, 4, v1
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, 15 bitop3:0xec
+; GFX1250-FAKE16-NEXT: global_store_b8 v[0:1], v0, off
+; GFX1250-FAKE16-NEXT: s_endpgm
store <8 x i1> %arg0, ptr addrspace(1) poison
ret void
}
@@ -2545,6 +3255,94 @@ define amdgpu_cs void @amdgpu_cs_v16i1(<16 x i1> %arg0) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v0, off
; GFX11-FAKE16-NEXT: s_endpgm
+;
+; GFX1250-TRUE16-LABEL: amdgpu_cs_v16i1:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: v_and_b16 v2.h, v6.l, 1
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.h, 3, v7.l
+; GFX1250-TRUE16-NEXT: v_and_b16 v4.h, v10.l, 1
+; GFX1250-TRUE16-NEXT: v_and_b16 v2.l, v2.l, 1
+; GFX1250-TRUE16-NEXT: v_and_b16 v3.h, v8.l, 1
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v2.h, 2, v2.h
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v5.h, 3, v11.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v4.h, 2, v4.h
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v6.l, 3, v15.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v6.h, 1, v13.l
+; GFX1250-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v2.h
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.h, 1, v5.l
+; GFX1250-TRUE16-NEXT: v_and_b16 v5.l, v14.l, 1
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v2.h, 1, v9.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v3.l, 3, v3.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v2.l, 2, v2.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.l, 1, v1.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v5.l, 2, v5.l
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.h, v4.l, v1.h, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v2.h, v3.h, 3, v2.h bitop3:0xc8
+; GFX1250-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v4.h
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v4.h, v12.l, v6.h, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_or_b16 v4.l, v6.l, v5.l
+; GFX1250-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.h, v1.h, v0.h, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.l, v2.h, 15, v3.h bitop3:0xc8
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.h, v4.h, v4.l, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v2.l, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v0.h, 4, v0.h
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.h, 12, v1.h
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v0.h, 15 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v1.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v0.h, 0xff bitop3:0xec
+; GFX1250-TRUE16-NEXT: flat_store_b16 v[0:1], v0
+; GFX1250-TRUE16-NEXT: s_endpgm
+;
+; GFX1250-FAKE16-LABEL: amdgpu_cs_v16i1:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v7, 3, v7
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v5, 1, v5
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v9, 1, v9
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v6, 2, v6
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v11, 3, v11
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v13, 1, v13
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v3, 3, v3
+; GFX1250-FAKE16-NEXT: v_or_b32_e32 v6, v7, v6
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v7, 1, v10
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v10, 1, v14
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v14, 3, v15
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v2, 2, v2
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v1, 1, v1
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v7, 2, v7
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v10, 2, v10
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v4, v4, v5, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v5, v8, 3, v9 bitop3:0xc8
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v9, v12, v13, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_or_b32_e32 v7, v11, v7
+; GFX1250-FAKE16-NEXT: v_or_b32_e32 v8, v14, v10
+; GFX1250-FAKE16-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v1, v4, v6, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v3, v5, 15, v7 bitop3:0xc8
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v4, v9, v8, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v2, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v1, 4, v1
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v2, 8, v3
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v3, 12, v4
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, 15 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_or_b32_e32 v1, v3, v2
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, 0xff bitop3:0xec
+; GFX1250-FAKE16-NEXT: global_store_b16 v[0:1], v0, off
+; GFX1250-FAKE16-NEXT: s_endpgm
store <16 x i1> %arg0, ptr addrspace(1) poison
ret void
}
@@ -2745,6 +3543,15 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
;
; GFX11-TRUE16-LABEL: amdgpu_cs_v32i1:
; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_and_b16 v26.l, v26.l, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 1, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, v24.l, 1
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, v22.l, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 1, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, v20.l, 1
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, v18.l, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 1, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, v16.l, 1
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, v10.l, 1
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 1, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, v8.l, 1
@@ -2754,6 +3561,18 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v2.l, 1
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 1, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 1
+; GFX11-TRUE16-NEXT: v_and_b16 v30.l, v30.l, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 1, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v28.l, v28.l, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 3, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 2, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v24.l, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 3, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 2, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v20.l, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 3, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 2, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, v14.l, 1
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 1, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, v12.l, 1
@@ -2766,15 +3585,15 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 3, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 2, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, v26.l, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 1, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, v24.l, 1
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, v22.l, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 1, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, v20.l, 1
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, v18.l, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 1, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, v16.l, 1
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 3, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 2, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v26.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, v22.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v22.l, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, v16.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v18.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, v16.l, 3
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 3, v15.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 2, v14.l
; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v12.l, v13.l
@@ -2784,65 +3603,42 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v0.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 3
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, v30.l, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 1, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, v28.l, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 3, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 2, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 3, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 2, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v20.l, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 3, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 2, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v28.h, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, v28.l, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v21.h, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v17.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.h, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, v8.h, 3
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 3, v31.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 2, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v28.l, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v25.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v23.l, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, v16.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v17.h, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v15.h, 3
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v26.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, v19.l, 15
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 4, v16.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v15.h, 15
; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, v1.l, 15
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 4, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 15
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v30.h, v28.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v24.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v14.h, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 12, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v14.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v16.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 12, v2.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v28.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v20.h, 15
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 4, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v1.h, 15
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 12, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-TRUE16-NEXT: global_store_b32 v[0:1], v0, off
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -2947,6 +3743,170 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-FAKE16-NEXT: global_store_b32 v[0:1], v0, off
; GFX11-FAKE16-NEXT: s_endpgm
+;
+; GFX1250-TRUE16-LABEL: amdgpu_cs_v32i1:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v18.h, 3, v19.l
+; GFX1250-TRUE16-NEXT: v_and_b16 v19.l, v22.l, 1
+; GFX1250-TRUE16-NEXT: v_and_b16 v18.l, v18.l, 1
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v17.h, 3, v23.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v19.h, 1, v21.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v17.l, 1, v17.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v19.l, 2, v19.l
+; GFX1250-TRUE16-NEXT: v_and_b16 v3.h, v6.l, 1
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v18.l, 2, v18.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.h, 3, v7.l
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v16.l, v16.l, v17.l, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_or_b16 v16.h, v17.h, v19.l
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v17.h, v20.l, v19.h, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_and_b16 v17.l, v26.l, 1
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v3.h, 2, v3.h
+; GFX1250-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v18.l
+; GFX1250-TRUE16-NEXT: v_and_b16 v18.h, v24.l, 1
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v16.h, v17.h, v16.h, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v17.h, 1, v25.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v19.l, 3, v27.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v17.l, 2, v17.l
+; GFX1250-TRUE16-NEXT: v_and_b16 v19.h, v30.l, 1
+; GFX1250-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v3.h
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.h, 1, v5.l
+; GFX1250-TRUE16-NEXT: v_and_b16 v5.l, v10.l, 1
+; GFX1250-TRUE16-NEXT: v_and_b16 v5.h, v14.l, 1
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v17.h, v18.h, 3, v17.h bitop3:0xc8
+; GFX1250-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v17.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v18.h, 3, v31.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v19.l, 2, v19.h
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v19.h, 1, v29.l
+; GFX1250-TRUE16-NEXT: v_and_b16 v2.l, v2.l, 1
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v3.h, 1, v9.l
+; GFX1250-TRUE16-NEXT: v_and_b16 v4.h, v8.l, 1
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v6.l, 3, v11.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v5.l, 2, v5.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v6.h, 3, v15.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v5.h, 2, v5.h
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v7.l, 1, v13.l
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v17.l, v17.h, 15, v17.l bitop3:0xc8
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v16.l, v16.l, v18.l, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_or_b16 v17.h, v18.h, v19.l
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v18.l, v28.l, v19.h, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v3.l, 3, v3.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v2.l, 2, v2.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.l, 1, v1.l
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.h, v4.l, v1.h, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v3.h, v4.h, 3, v3.h bitop3:0xc8
+; GFX1250-TRUE16-NEXT: v_or_b16 v4.l, v6.l, v5.l
+; GFX1250-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v5.h
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v5.l, v12.l, v7.l, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v2.h, v18.l, v17.h, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.h, v1.h, v0.h, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.l, v3.h, 15, v4.l bitop3:0xc8
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.h, v5.l, v4.h, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v16.h, 4, v16.h
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v2.h, 12, v2.h
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v2.l, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v0.h, 4, v0.h
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b16 v1.h, 12, v1.h
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v2.l, v16.l, v16.h, 15 bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v17.l
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v0.h, 15 bitop3:0xec
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1250-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v1.l
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.h, v2.l, v2.h, 0xff bitop3:0xec
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v0.h, 0xff bitop3:0xec
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX1250-TRUE16-NEXT: global_store_b32 v[0:1], v0, off
+; GFX1250-TRUE16-NEXT: s_endpgm
+;
+; GFX1250-FAKE16-LABEL: amdgpu_cs_v32i1:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v3, 3, v3
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v7, 3, v7
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v5, 1, v5
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v2, 2, v2
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v6, 2, v6
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v1, 1, v1
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v4, v4, v5, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1250-FAKE16-NEXT: v_or_b32_e32 v3, v7, v6
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v6, 1, v10
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v5, 1, v9
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v7, 1, v8
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v8, 3, v11
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v3, v4, v3, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v6, 2, v6
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v4, v7, 3, v5 bitop3:0xc8
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v1, 1, v14
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v7, 3, v23
+; GFX1250-FAKE16-NEXT: v_or_b32_e32 v5, v8, v6
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v6, 3, v15
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v2, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v1, 2, v1
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v2, 1, v13
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v4, v4, 15, v5 bitop3:0xc8
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 1, v22
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v9, 1, v26
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v11, 1, v30
+; GFX1250-FAKE16-NEXT: v_or_b32_e32 v1, v6, v1
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v2, v12, v2, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v5, 2, v5
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v6, 1, v18
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v10, 1, v25
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v12, 1, v24
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v13, 3, v27
+; GFX1250-FAKE16-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v7, 1, v21
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v9, 2, v9
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v14, 3, v31
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v11, 2, v11
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v15, 1, v29
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v1, v2, v1, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v2, 3, v19
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v6, 2, v6
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v8, 1, v17
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v7, v20, v7, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v10, v12, 3, v10 bitop3:0xc8
+; GFX1250-FAKE16-NEXT: v_or_b32_e32 v9, v13, v9
+; GFX1250-FAKE16-NEXT: v_or_b32_e32 v11, v14, v11
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v12, v28, v15, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_or_b32_e32 v2, v2, v6
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v6, v16, v8, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v5, v7, v5, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v7, v10, 15, v9 bitop3:0xc8
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v8, v12, v11, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v3, 4, v3
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v4, 8, v4
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v1, 12, v1
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v2, v6, v2, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v5, 4, v5
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v6, 8, v7
+; GFX1250-FAKE16-NEXT: v_lshlrev_b16 v7, 12, v8
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v3, 15 bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v2, v2, v5, 15 bitop3:0xec
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-FAKE16-NEXT: v_or_b32_e32 v3, v7, v6
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v0, v0, v1, 0xff bitop3:0xec
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-FAKE16-NEXT: v_bitop3_b16 v1, v2, v3, 0xff bitop3:0xec
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX1250-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX1250-FAKE16-NEXT: global_store_b32 v[0:1], v0, off
+; GFX1250-FAKE16-NEXT: s_endpgm
store <32 x i1> %arg0, ptr addrspace(1) poison
ret void
}
@@ -2975,6 +3935,14 @@ define amdgpu_cs void @amdgpu_cs_inreg_i1(i1 inreg %arg0) {
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: amdgpu_cs_inreg_i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s0, s0, 1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: global_store_b8 v[0:1], v0, off
+; GFX1250-NEXT: s_endpgm
store i1 %arg0, ptr addrspace(1) poison
ret void
}
@@ -3063,6 +4031,34 @@ define amdgpu_cs void @amdgpu_cs_inreg_v8i1(<8 x i1> inreg %arg0) {
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: amdgpu_cs_inreg_v8i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s6, s6, 1
+; GFX1250-NEXT: s_lshl_b32 s5, s5, 1
+; GFX1250-NEXT: s_and_b32 s4, s4, 1
+; GFX1250-NEXT: s_and_b32 s2, s2, 1
+; GFX1250-NEXT: s_lshl_b32 s1, s1, 1
+; GFX1250-NEXT: s_and_b32 s0, s0, 1
+; GFX1250-NEXT: s_lshl_b32 s7, s7, 3
+; GFX1250-NEXT: s_lshl_b32 s6, s6, 2
+; GFX1250-NEXT: s_or_b32 s4, s4, s5
+; GFX1250-NEXT: s_lshl_b32 s3, s3, 3
+; GFX1250-NEXT: s_lshl_b32 s2, s2, 2
+; GFX1250-NEXT: s_or_b32 s0, s0, s1
+; GFX1250-NEXT: s_or_b32 s5, s7, s6
+; GFX1250-NEXT: s_and_b32 s4, s4, 3
+; GFX1250-NEXT: s_or_b32 s1, s3, s2
+; GFX1250-NEXT: s_and_b32 s0, s0, 3
+; GFX1250-NEXT: s_or_b32 s2, s4, s5
+; GFX1250-NEXT: s_or_b32 s0, s0, s1
+; GFX1250-NEXT: s_lshl_b32 s1, s2, 4
+; GFX1250-NEXT: s_and_b32 s0, s0, 15
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_or_b32 s0, s0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: global_store_b8 v[0:1], v0, off
+; GFX1250-NEXT: s_endpgm
store <8 x i1> %arg0, ptr addrspace(1) poison
ret void
}
@@ -3223,6 +4219,58 @@ define amdgpu_cs void @amdgpu_cs_inreg_v16i1(<16 x i1> inreg %arg0) {
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b16 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: amdgpu_cs_inreg_v16i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s10, s10, 1
+; GFX1250-NEXT: s_lshl_b32 s9, s9, 1
+; GFX1250-NEXT: s_and_b32 s8, s8, 1
+; GFX1250-NEXT: s_and_b32 s6, s6, 1
+; GFX1250-NEXT: s_lshl_b32 s5, s5, 1
+; GFX1250-NEXT: s_and_b32 s4, s4, 1
+; GFX1250-NEXT: s_and_b32 s2, s2, 1
+; GFX1250-NEXT: s_lshl_b32 s1, s1, 1
+; GFX1250-NEXT: s_and_b32 s0, s0, 1
+; GFX1250-NEXT: s_and_b32 s14, s14, 1
+; GFX1250-NEXT: s_lshl_b32 s13, s13, 1
+; GFX1250-NEXT: s_and_b32 s12, s12, 1
+; GFX1250-NEXT: s_lshl_b32 s11, s11, 3
+; GFX1250-NEXT: s_lshl_b32 s10, s10, 2
+; GFX1250-NEXT: s_or_b32 s8, s8, s9
+; GFX1250-NEXT: s_lshl_b32 s7, s7, 3
+; GFX1250-NEXT: s_lshl_b32 s6, s6, 2
+; GFX1250-NEXT: s_or_b32 s4, s4, s5
+; GFX1250-NEXT: s_lshl_b32 s3, s3, 3
+; GFX1250-NEXT: s_lshl_b32 s2, s2, 2
+; GFX1250-NEXT: s_or_b32 s0, s0, s1
+; GFX1250-NEXT: s_lshl_b32 s15, s15, 3
+; GFX1250-NEXT: s_lshl_b32 s14, s14, 2
+; GFX1250-NEXT: s_or_b32 s12, s12, s13
+; GFX1250-NEXT: s_or_b32 s9, s11, s10
+; GFX1250-NEXT: s_and_b32 s8, s8, 3
+; GFX1250-NEXT: s_or_b32 s5, s7, s6
+; GFX1250-NEXT: s_and_b32 s4, s4, 3
+; GFX1250-NEXT: s_or_b32 s1, s3, s2
+; GFX1250-NEXT: s_and_b32 s0, s0, 3
+; GFX1250-NEXT: s_or_b32 s13, s15, s14
+; GFX1250-NEXT: s_and_b32 s12, s12, 3
+; GFX1250-NEXT: s_or_b32 s8, s8, s9
+; GFX1250-NEXT: s_or_b32 s2, s4, s5
+; GFX1250-NEXT: s_or_b32 s0, s0, s1
+; GFX1250-NEXT: s_or_b32 s10, s12, s13
+; GFX1250-NEXT: s_and_b32 s8, s8, 15
+; GFX1250-NEXT: s_lshl_b32 s1, s2, 4
+; GFX1250-NEXT: s_and_b32 s0, s0, 15
+; GFX1250-NEXT: s_lshl_b32 s9, s10, 12
+; GFX1250-NEXT: s_lshl_b32 s2, s8, 8
+; GFX1250-NEXT: s_or_b32 s0, s0, s1
+; GFX1250-NEXT: s_or_b32 s1, s9, s2
+; GFX1250-NEXT: s_and_b32 s0, s0, 0xff
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_or_b32 s0, s0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: global_store_b16 v[0:1], v0, off
+; GFX1250-NEXT: s_endpgm
store <16 x i1> %arg0, ptr addrspace(1) poison
ret void
}
@@ -3527,6 +4575,106 @@ define amdgpu_cs void @amdgpu_cs_inreg_v32i1(<32 x i1> inreg %arg0) {
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b32 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: amdgpu_cs_inreg_v32i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s10, s10, 1
+; GFX1250-NEXT: s_lshl_b32 s9, s9, 1
+; GFX1250-NEXT: s_and_b32 s8, s8, 1
+; GFX1250-NEXT: s_and_b32 s14, s14, 1
+; GFX1250-NEXT: s_lshl_b32 s13, s13, 1
+; GFX1250-NEXT: s_and_b32 s12, s12, 1
+; GFX1250-NEXT: s_lshl_b32 s11, s11, 3
+; GFX1250-NEXT: s_lshl_b32 s10, s10, 2
+; GFX1250-NEXT: s_or_b32 s8, s8, s9
+; GFX1250-NEXT: s_and_b32 s6, s6, 1
+; GFX1250-NEXT: s_lshl_b32 s5, s5, 1
+; GFX1250-NEXT: s_and_b32 s4, s4, 1
+; GFX1250-NEXT: s_and_b32 s2, s2, 1
+; GFX1250-NEXT: s_lshl_b32 s1, s1, 1
+; GFX1250-NEXT: s_and_b32 s0, s0, 1
+; GFX1250-NEXT: s_lshl_b32 s15, s15, 3
+; GFX1250-NEXT: s_lshl_b32 s14, s14, 2
+; GFX1250-NEXT: s_or_b32 s12, s12, s13
+; GFX1250-NEXT: s_or_b32 s9, s11, s10
+; GFX1250-NEXT: s_and_b32 s8, s8, 3
+; GFX1250-NEXT: s_lshl_b32 s7, s7, 3
+; GFX1250-NEXT: s_lshl_b32 s6, s6, 2
+; GFX1250-NEXT: s_or_b32 s4, s4, s5
+; GFX1250-NEXT: s_lshl_b32 s3, s3, 3
+; GFX1250-NEXT: s_lshl_b32 s2, s2, 2
+; GFX1250-NEXT: s_or_b32 s0, s0, s1
+; GFX1250-NEXT: s_or_b32 s13, s15, s14
+; GFX1250-NEXT: s_and_b32 s12, s12, 3
+; GFX1250-NEXT: s_or_b32 s8, s8, s9
+; GFX1250-NEXT: s_or_b32 s5, s7, s6
+; GFX1250-NEXT: s_and_b32 s4, s4, 3
+; GFX1250-NEXT: s_or_b32 s1, s3, s2
+; GFX1250-NEXT: s_and_b32 s0, s0, 3
+; GFX1250-NEXT: s_or_b32 s10, s12, s13
+; GFX1250-NEXT: s_and_b32 s8, s8, 15
+; GFX1250-NEXT: s_or_b32 s2, s4, s5
+; GFX1250-NEXT: s_or_b32 s0, s0, s1
+; GFX1250-NEXT: s_lshl_b32 s9, s10, 12
+; GFX1250-NEXT: s_lshl_b32 s1, s2, 4
+; GFX1250-NEXT: s_and_b32 s0, s0, 15
+; GFX1250-NEXT: s_lshl_b32 s2, s8, 8
+; GFX1250-NEXT: s_and_b32 s3, s30, 1
+; GFX1250-NEXT: s_lshl_b32 s4, s29, 1
+; GFX1250-NEXT: s_and_b32 s5, s28, 1
+; GFX1250-NEXT: s_or_b32 s0, s0, s1
+; GFX1250-NEXT: s_or_b32 s1, s9, s2
+; GFX1250-NEXT: s_lshl_b32 s2, s31, 3
+; GFX1250-NEXT: s_lshl_b32 s3, s3, 2
+; GFX1250-NEXT: s_or_b32 s4, s5, s4
+; GFX1250-NEXT: s_and_b32 s5, s26, 1
+; GFX1250-NEXT: s_lshl_b32 s6, s25, 1
+; GFX1250-NEXT: s_and_b32 s7, s24, 1
+; GFX1250-NEXT: s_or_b32 s2, s2, s3
+; GFX1250-NEXT: s_and_b32 s3, s4, 3
+; GFX1250-NEXT: s_lshl_b32 s4, s27, 3
+; GFX1250-NEXT: s_lshl_b32 s5, s5, 2
+; GFX1250-NEXT: s_or_b32 s6, s7, s6
+; GFX1250-NEXT: s_or_b32 s4, s4, s5
+; GFX1250-NEXT: s_and_b32 s5, s6, 3
+; GFX1250-NEXT: s_or_b32 s2, s3, s2
+; GFX1250-NEXT: s_or_b32 s3, s5, s4
+; GFX1250-NEXT: s_and_b32 s5, s22, 1
+; GFX1250-NEXT: s_lshl_b32 s6, s21, 1
+; GFX1250-NEXT: s_and_b32 s7, s20, 1
+; GFX1250-NEXT: s_lshl_b32 s4, s23, 3
+; GFX1250-NEXT: s_lshl_b32 s5, s5, 2
+; GFX1250-NEXT: s_or_b32 s6, s7, s6
+; GFX1250-NEXT: s_and_b32 s7, s18, 1
+; GFX1250-NEXT: s_lshl_b32 s8, s17, 1
+; GFX1250-NEXT: s_and_b32 s9, s16, 1
+; GFX1250-NEXT: s_or_b32 s4, s4, s5
+; GFX1250-NEXT: s_and_b32 s5, s6, 3
+; GFX1250-NEXT: s_lshl_b32 s6, s19, 3
+; GFX1250-NEXT: s_lshl_b32 s7, s7, 2
+; GFX1250-NEXT: s_or_b32 s8, s9, s8
+; GFX1250-NEXT: s_or_b32 s6, s6, s7
+; GFX1250-NEXT: s_and_b32 s7, s8, 3
+; GFX1250-NEXT: s_or_b32 s4, s5, s4
+; GFX1250-NEXT: s_or_b32 s5, s7, s6
+; GFX1250-NEXT: s_and_b32 s3, s3, 15
+; GFX1250-NEXT: s_lshl_b32 s4, s4, 4
+; GFX1250-NEXT: s_and_b32 s5, s5, 15
+; GFX1250-NEXT: s_lshl_b32 s2, s2, 12
+; GFX1250-NEXT: s_lshl_b32 s3, s3, 8
+; GFX1250-NEXT: s_or_b32 s4, s5, s4
+; GFX1250-NEXT: s_and_b32 s0, s0, 0xff
+; GFX1250-NEXT: s_or_b32 s2, s2, s3
+; GFX1250-NEXT: s_and_b32 s3, s4, 0xff
+; GFX1250-NEXT: s_or_b32 s0, s0, s1
+; GFX1250-NEXT: s_or_b32 s1, s3, s2
+; GFX1250-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX1250-NEXT: s_lshl_b32 s1, s1, 16
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_or_b32 s0, s0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: global_store_b32 v[0:1], v0, off
+; GFX1250-NEXT: s_endpgm
store <32 x i1> %arg0, ptr addrspace(1) poison
ret void
}
@@ -3551,6 +4699,12 @@ define amdgpu_cs void @amdgpu_cs_i1_sext(i1 signext %arg0) {
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: amdgpu_cs_i1_sext:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1250-NEXT: global_store_b8 v[0:1], v0, off
+; GFX1250-NEXT: s_endpgm
store i1 %arg0, ptr addrspace(1) poison
ret void
}
@@ -3572,6 +4726,11 @@ define amdgpu_cs void @amdgpu_cs_i1_zext(i1 zeroext %arg0) {
; GFX11: ; %bb.0:
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: amdgpu_cs_i1_zext:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_store_b8 v[0:1], v0, off
+; GFX1250-NEXT: s_endpgm
store i1 %arg0, ptr addrspace(1) poison
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/change-scc-to-vcc.mir b/llvm/test/CodeGen/AMDGPU/change-scc-to-vcc.mir
index 4ff3b5ab5..4ad8dde 100644
--- a/llvm/test/CodeGen/AMDGPU/change-scc-to-vcc.mir
+++ b/llvm/test/CodeGen/AMDGPU/change-scc-to-vcc.mir
@@ -21,7 +21,6 @@ body: |
; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 killed [[DEF]], [[COPY]], implicit-def $vcc_lo, implicit $exec
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[DEF3]]
; GCN-NEXT: [[V_ADDC_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADDC_U32_e32 killed [[DEF2]], [[COPY1]], implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
- ; GCN-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GCN-NEXT: [[V_ADDC_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADDC_U32_e32 [[V_MUL_HI_U32_U24_e64_]], [[COPY2]], implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
%0:sreg_32 = S_MOV_B32 681
@@ -52,11 +51,9 @@ body: |
; GCN-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF5:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[DEF6:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 killed [[DEF2]], [[DEF]], implicit-def $vcc_lo, implicit $exec
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed [[DEF4]]
; GCN-NEXT: [[V_ADDC_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADDC_U32_e32 killed [[DEF3]], [[COPY]], implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
- ; GCN-NEXT: [[DEF7:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[DEF5]]
; GCN-NEXT: [[V_ADDC_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADDC_U32_e32 killed [[V_ADDC_U32_e32_1]], [[COPY1]], implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
index b9caf8e..ccdc0b1 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -1561,10 +1561,10 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0 clamp
; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
index f153b30..72a6e8f 100644
--- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
@@ -45,13 +45,13 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY2]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, [[COPY2]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
undef %2.sub0:areg_64_align2 = COPY %0
%2.sub1:areg_64_align2 = COPY %1
- INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
@@ -72,7 +72,7 @@ body: |
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96 = COPY [[COPY2]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, [[COPY3]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
@@ -80,7 +80,7 @@ body: |
undef %3.sub0:areg_96 = COPY %0
%3.sub1:areg_96 = COPY %1
%3.sub2:areg_96 = COPY %2
- INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %3
+ INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, %3
SI_RETURN
...
@@ -101,7 +101,7 @@ body: |
; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY2]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5046281 /* reguse:AReg_96_Align2 */, [[COPY3]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
@@ -109,7 +109,7 @@ body: |
undef %3.sub0:areg_96_align2 = COPY %0
%3.sub1:areg_96_align2 = COPY %1
%3.sub2:areg_96_align2 = COPY %2
- INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, %3
+ INLINEASM &"; use $0", 0 /* attdialect */, 5046281 /* reguse:AReg_96_Align2 */, %3
SI_RETURN
...
@@ -128,13 +128,13 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY2]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_128 */, [[COPY2]]
; CHECK-NEXT: SI_RETURN
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128 = COPY %0
%2.sub2_sub3:areg_128 = COPY %1
- INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, killed %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_128 */, killed %2
SI_RETURN
...
@@ -153,13 +153,13 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY2]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY2]]
; CHECK-NEXT: SI_RETURN
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128_align2 = COPY %0
%2.sub2_sub3:areg_128_align2 = COPY %1
- INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_128_Align2 */, %2
SI_RETURN
...
@@ -178,13 +178,13 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr9
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY2]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, [[COPY2]]
; CHECK-NEXT: SI_RETURN
%0:sgpr_32 = COPY $sgpr8
%1:sgpr_32 = COPY $sgpr9
undef %2.sub0:areg_64_align2 = COPY %0
%2.sub1:areg_64_align2 = COPY %1
- INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
@@ -203,13 +203,13 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY2]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, [[COPY2]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
%1:vreg_64 = COPY $vgpr1_vgpr2
undef %2.sub0:areg_96 = COPY %0
%2.sub1_sub2:areg_96 = COPY %1
- INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, %2
SI_RETURN
...
@@ -228,13 +228,13 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY2]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, [[COPY2]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
%1:vreg_64 = COPY $vgpr1_vgpr2
undef %2.sub0:areg_96_align2 = COPY %0
%2.sub1_sub2:areg_96_align2 = COPY %1
- INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
@@ -253,13 +253,13 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96 = COPY [[COPY1]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY2]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, [[COPY2]]
; CHECK-NEXT: SI_RETURN
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vgpr_32 = COPY $vgpr2
undef %2.sub0_sub1:areg_96 = COPY %0
%2.sub2:areg_96 = COPY %1
- INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, %2
SI_RETURN
...
@@ -278,13 +278,13 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY2]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, [[COPY2]]
; CHECK-NEXT: SI_RETURN
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vgpr_32 = COPY $vgpr2
undef %2.sub0_sub1:areg_96_align2 = COPY %0
%2.sub2:areg_96_align2 = COPY %1
- INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
@@ -326,13 +326,13 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY2]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, [[COPY2]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
undef %2.sub0:areg_64_align2 = COPY %0
%2.sub1:areg_64_align2 = COPY %1
- INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
@@ -350,12 +350,12 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
undef %1.sub0:areg_96 = COPY %0
%1.sub1:areg_96 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %1
+ INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, %1
SI_RETURN
...
@@ -373,12 +373,12 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5046281 /* reguse:AReg_96_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
undef %1.sub0:areg_96_align2 = COPY %0
%1.sub1:areg_96_align2 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, %1
+ INLINEASM &"; use $0", 0 /* attdialect */, 5046281 /* reguse:AReg_96_Align2 */, %1
SI_RETURN
...
@@ -398,14 +398,14 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]]
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]]
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
undef %1.sub0:areg_128 = COPY %0
%1.sub1:areg_128 = COPY %0
%1.sub2:areg_128 = COPY %0
%1.sub3:areg_128 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, killed %1
+ INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_128 */, killed %1
SI_RETURN
...
@@ -425,14 +425,14 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]]
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]]
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
undef %1.sub0:areg_128_align2 = COPY %0
%1.sub1:areg_128_align2 = COPY %0
%1.sub2:areg_128_align2 = COPY %0
%1.sub3:areg_128_align2 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, %1
+ INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_128_Align2 */, %1
SI_RETURN
...
@@ -558,13 +558,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_64 = COPY $vgpr0
%0.sub1:vreg_64 = COPY $vgpr1
undef %2.sub0:areg_64_align2 = COPY %0.sub0
%2.sub1:areg_64_align2 = COPY %0.sub1
- INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
@@ -585,7 +585,7 @@ body: |
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_96 =COPY $vgpr0
%0.sub1:vreg_96 = COPY $vgpr1
@@ -593,7 +593,7 @@ body: |
undef %3.sub0:areg_96 = COPY %0.sub0
%3.sub1:areg_96 = COPY %0.sub1
%3.sub2:areg_96 = COPY %0.sub2
- INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %3
+ INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, %3
SI_RETURN
...
@@ -614,7 +614,7 @@ body: |
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5046281 /* reguse:AReg_96_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_96 =COPY $vgpr0
%0.sub1:vreg_96 = COPY $vgpr1
@@ -622,7 +622,7 @@ body: |
undef %3.sub0:areg_96_align2 = COPY %0.sub0
%3.sub1:areg_96_align2 = COPY %0.sub1
%3.sub2:areg_96_align2 = COPY %0.sub2
- INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, %3
+ INLINEASM &"; use $0", 0 /* attdialect */, 5046281 /* reguse:AReg_96_Align2 */, %3
SI_RETURN
...
@@ -641,13 +641,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0_sub1:vreg_128 =COPY $vgpr0_vgpr1
%0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1
%2.sub2_sub3:areg_128 = COPY %0.sub2_sub3
- INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, killed %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_128 */, killed %2
SI_RETURN
...
@@ -668,13 +668,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_128 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub1
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_128 =COPY $vgpr0_vgpr1
%0.sub1:vreg_128 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0
%2.sub2_sub3:areg_128_align2 = COPY %0.sub1
- INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_128_Align2 */, %2
SI_RETURN
...
@@ -693,13 +693,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:sreg_64 = COPY $sgpr9
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:sreg_64 = COPY $sgpr8
%0.sub1:sreg_64 = COPY $sgpr9
undef %2.sub0:areg_64_align2 = COPY %0.sub0
%2.sub1:areg_64_align2 = COPY %0.sub1
- INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
@@ -718,13 +718,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_96 =COPY $vgpr0
%0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
undef %2.sub0:areg_96 = COPY %0.sub0
%2.sub1_sub2:areg_96 = COPY %0.sub1_sub2
- INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, %2
SI_RETURN
...
@@ -743,13 +743,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_96 =COPY $vgpr0
%0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
undef %2.sub0:areg_96_align2 = COPY %0.sub0
%2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2
- INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
@@ -768,13 +768,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
%0.sub2:vreg_96 = COPY $vgpr2
undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1
%2.sub2:areg_96 = COPY %0.sub2
- INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, %2
SI_RETURN
...
@@ -793,13 +793,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
%0.sub2:vreg_96 = COPY $vgpr2
undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1
%2.sub2:areg_96_align2 = COPY %0.sub2
- INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
@@ -841,13 +841,13 @@ body: |
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub0
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_64 = COPY $vgpr0
undef %1.sub0:areg_96 = COPY %0.sub0
%1.sub1:areg_96 = COPY %0.sub0
%1.sub2:areg_96 = COPY %0.sub0
- INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %1
+ INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, %1
SI_RETURN
...
@@ -865,12 +865,12 @@ body: |
; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub0
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5046281 /* reguse:AReg_96_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_64 = COPY $vgpr0
undef %1.sub0:areg_96_align2 = COPY %0.sub0
%1.sub1:areg_96_align2 = COPY %0.sub0
- INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, %1
+ INLINEASM &"; use $0", 0 /* attdialect */, 5046281 /* reguse:AReg_96_Align2 */, %1
SI_RETURN
...
@@ -890,14 +890,14 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_64 = COPY $vgpr0
undef %1.sub0:areg_128 = COPY %0.sub0
%1.sub1:areg_128 = COPY %0.sub0
%1.sub2:areg_128 = COPY %0.sub0
%1.sub3:areg_128 = COPY %0.sub0
- INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, killed %1
+ INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_128 */, killed %1
SI_RETURN
...
@@ -917,14 +917,14 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_64 = COPY $vgpr0
undef %1.sub0:areg_128_align2 = COPY %0.sub0
%1.sub1:areg_128_align2 = COPY %0.sub0
%1.sub2:areg_128_align2 = COPY %0.sub0
%1.sub3:areg_128_align2 = COPY %0.sub0
- INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, %1
+ INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_128_Align2 */, %1
SI_RETURN
...
@@ -968,13 +968,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_64_align2 = COPY $vgpr0
%0.sub1:vreg_64_align2 = COPY $vgpr1
undef %2.sub0:areg_64_align2 = COPY %0.sub0
%2.sub1:areg_64_align2 = COPY %0.sub1
- INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
@@ -995,7 +995,7 @@ body: |
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_96 = COPY $vgpr0
%0.sub1:vreg_96 = COPY $vgpr1
@@ -1003,7 +1003,7 @@ body: |
undef %3.sub0:areg_96 = COPY %0.sub0
%3.sub1:areg_96 = COPY %0.sub1
%3.sub2:areg_96 = COPY %0.sub2
- INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %3
+ INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, %3
SI_RETURN
...
@@ -1024,7 +1024,7 @@ body: |
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5046281 /* reguse:AReg_96_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_96_align2 = COPY $vgpr0
%0.sub1:vreg_96_align2 = COPY $vgpr1
@@ -1032,7 +1032,7 @@ body: |
undef %3.sub0:areg_96_align2 = COPY %0.sub0
%3.sub1:areg_96_align2 = COPY %0.sub1
%3.sub2:areg_96_align2 = COPY %0.sub2
- INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, %3
+ INLINEASM &"; use $0", 0 /* attdialect */, 5046281 /* reguse:AReg_96_Align2 */, %3
SI_RETURN
...
@@ -1051,13 +1051,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
%0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1
%2.sub2_sub3:areg_128 = COPY %0.sub2_sub3
- INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, killed %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_128 */, killed %2
SI_RETURN
...
@@ -1076,13 +1076,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0_sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub2_sub3
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0_sub1:vreg_128_align2 = COPY $vgpr0_vgpr1
%0.sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0_sub1
%2.sub2_sub3:areg_128_align2 = COPY %0.sub2_sub3
- INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_128_Align2 */, %2
SI_RETURN
...
@@ -1101,13 +1101,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:sreg_64 = COPY $sgpr9
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:sreg_64 = COPY $sgpr8
%0.sub1:sreg_64 = COPY $sgpr9
undef %2.sub0:areg_64_align2 = COPY %0.sub0
%2.sub1:areg_64_align2 = COPY %0.sub1
- INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
@@ -1126,13 +1126,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_96 = COPY $vgpr0
%0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
undef %2.sub0:areg_96 = COPY %0.sub0
%2.sub1_sub2:areg_96 = COPY %0.sub1_sub2
- INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, %2
SI_RETURN
...
@@ -1150,13 +1150,13 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY2]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, [[COPY2]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_96 = COPY $vgpr0
%0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
undef %2.sub0:areg_96 = COPY %0.sub2
%2.sub1_sub2:areg_96 = COPY %0.sub0_sub1
- INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, %2
SI_RETURN
...
@@ -1176,13 +1176,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96_align2 = COPY $vgpr1_vgpr2
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_96_align2 = COPY $vgpr0
%0.sub1_sub2:vreg_96_align2 = COPY $vgpr1_vgpr2
undef %2.sub0:areg_96_align2 = COPY %0.sub0
%2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2
- INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
@@ -1201,13 +1201,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
%0.sub2:vreg_96 = COPY $vgpr2
undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1
%2.sub2:areg_96 = COPY %0.sub2
- INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, %2
SI_RETURN
...
@@ -1226,13 +1226,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96_align2 = COPY $vgpr2
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0_sub1:vreg_96_align2 = COPY $vgpr0_vgpr1
%0.sub2:vreg_96_align2 = COPY $vgpr2
undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1
%2.sub2:areg_96_align2 = COPY %0.sub2
- INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
@@ -1251,13 +1251,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
%0.sub2:vreg_96 = COPY $vgpr2
undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1
%2.sub2:areg_96_align2 = COPY %0.sub2
- INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
@@ -1295,11 +1295,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vreg_64_align2 = COPY $vgpr0_vgpr1
%2:areg_64_align2 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
@@ -1316,11 +1316,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
%3:areg_96 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %3
+ INLINEASM &"; use $0", 0 /* attdialect */, 4718601 /* reguse:AReg_96 */, %3
SI_RETURN
...
@@ -1337,11 +1337,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 5046281 /* reguse:AReg_96_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2
%3:areg_96_align2 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, %3
+ INLINEASM &"; use $0", 0 /* attdialect */, 5046281 /* reguse:AReg_96_Align2 */, %3
SI_RETURN
...
@@ -1358,11 +1358,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
%2:areg_128 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, killed %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 6160393 /* reguse:AReg_128 */, killed %2
SI_RETURN
...
@@ -1379,11 +1379,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
%2:areg_128_align2 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 6422537 /* reguse:AReg_128_Align2 */, %2
SI_RETURN
...
@@ -1400,11 +1400,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:sreg_64 = COPY $sgpr8_sgpr9
%2:areg_64_align2 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
@@ -1421,11 +1421,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]]
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
%0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
%2:areg_96_align2 = COPY %0
- INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3866633 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index c30ce8c..2b63a8c 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -437,7 +437,6 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_mov_b32 s4, 2
; GCN-O0-NEXT: v_lshlrev_b32_e64 v2, s4, v0
; GCN-O0-NEXT: s_mov_b32 s4, 0
-; GCN-O0-NEXT: ; implicit-def: $sgpr4
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v1, 0
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
@@ -668,7 +667,6 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_mov_b32 s0, 2
; GCN-O0-NEXT: v_lshlrev_b32_e64 v2, s0, v0
; GCN-O0-NEXT: s_mov_b32 s1, 0
-; GCN-O0-NEXT: ; implicit-def: $sgpr1
; GCN-O0-NEXT: s_waitcnt expcnt(0)
; GCN-O0-NEXT: v_mov_b32_e32 v1, 0
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index b5bc09a..26f204f 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -946,9 +946,9 @@ define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
; GFX11-TRUE16-LABEL: v_uitofp_i8_to_f64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1770,40 +1770,38 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; GFX11-TRUE16-LABEL: load_v4i8_to_v4f32_2_uses:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v4, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v4.l, 9
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff00, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff00, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 9
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff00, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff00, v4.h
; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 9
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x900, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.h
+; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x900, v0.l
-; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x900, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x900, v0.h
; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v7
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[0:1]
-; GFX11-TRUE16-NEXT: global_store_b32 v5, v4, s[2:3]
+; GFX11-TRUE16-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX11-TRUE16-NEXT: global_store_b32 v6, v4, s[2:3]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: load_v4i8_to_v4f32_2_uses:
diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
index 9a98a7c..12de375 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
@@ -42,7 +42,7 @@ define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) %
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
- %n32 = atomicrmw sub ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
+ %n32 = atomicrmw sub ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%n64 = zext i32 %n32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
store float 1.0, ptr addrspace(1) %p1
@@ -64,7 +64,7 @@ define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) %
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
- %n32 = atomicrmw and ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
+ %n32 = atomicrmw and ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%n64 = zext i32 %n32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
store float 1.0, ptr addrspace(1) %p1
@@ -86,7 +86,7 @@ define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
- %n32 = atomicrmw or ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
+ %n32 = atomicrmw or ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%n64 = zext i32 %n32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
store float 1.0, ptr addrspace(1) %p1
@@ -108,7 +108,7 @@ define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) %
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
- %n32 = atomicrmw xor ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
+ %n32 = atomicrmw xor ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%n64 = zext i32 %n32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
store float 1.0, ptr addrspace(1) %p1
@@ -144,7 +144,7 @@ define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1)
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
- %n32 = atomicrmw nand ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
+ %n32 = atomicrmw nand ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%n64 = zext i32 %n32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
store float 1.0, ptr addrspace(1) %p1
@@ -166,7 +166,7 @@ define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addr
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
- %n32 = atomicrmw max ptr addrspace(1) %p, i32 1 syncscope("workgroup") monotonic
+ %n32 = atomicrmw max ptr addrspace(1) %p, i32 1 syncscope("workgroup") monotonic, !amdgpu.no.fine.grained.memory !0
%n64 = zext i32 %n32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
store float 1.0, ptr addrspace(1) %p1
@@ -188,7 +188,7 @@ define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) %
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
- %n32 = atomicrmw max ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
+ %n32 = atomicrmw max ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%n64 = zext i32 %n32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
store float 1.0, ptr addrspace(1) %p1
@@ -210,7 +210,7 @@ define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addr
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
- %n32 = atomicrmw min ptr addrspace(1) %p, i32 1 syncscope("workgroup") monotonic
+ %n32 = atomicrmw min ptr addrspace(1) %p, i32 1 syncscope("workgroup") monotonic, !amdgpu.no.fine.grained.memory !0
%n64 = zext i32 %n32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
store float 1.0, ptr addrspace(1) %p1
@@ -232,7 +232,7 @@ define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) %
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
- %n32 = atomicrmw min ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
+ %n32 = atomicrmw min ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%n64 = zext i32 %n32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
store float 1.0, ptr addrspace(1) %p1
@@ -254,7 +254,7 @@ define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr add
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
- %n32 = atomicrmw umax ptr addrspace(1) %p, i32 1 syncscope("workgroup") monotonic
+ %n32 = atomicrmw umax ptr addrspace(1) %p, i32 1 syncscope("workgroup") monotonic, !amdgpu.no.fine.grained.memory !0
%n64 = zext i32 %n32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
store float 1.0, ptr addrspace(1) %p1
@@ -276,7 +276,7 @@ define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1)
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
- %n32 = atomicrmw umax ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
+ %n32 = atomicrmw umax ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%n64 = zext i32 %n32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
store float 1.0, ptr addrspace(1) %p1
@@ -298,7 +298,7 @@ define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr add
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
- %n32 = atomicrmw umin ptr addrspace(1) %p, i32 1 syncscope("workgroup") monotonic
+ %n32 = atomicrmw umin ptr addrspace(1) %p, i32 1 syncscope("workgroup") monotonic, !amdgpu.no.fine.grained.memory !0
%n64 = zext i32 %n32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
store float 1.0, ptr addrspace(1) %p1
@@ -320,7 +320,7 @@ define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1)
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
- %n32 = atomicrmw umin ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
+ %n32 = atomicrmw umin ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%n64 = zext i32 %n32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
store float 1.0, ptr addrspace(1) %p1
@@ -388,7 +388,7 @@ define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) %
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
- %n32 = atomicrmw uinc_wrap ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
+ %n32 = atomicrmw uinc_wrap ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%n64 = zext i32 %n32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
store float 1.0, ptr addrspace(1) %p1
@@ -410,7 +410,7 @@ define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) %
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
- %n32 = atomicrmw udec_wrap ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic
+ %n32 = atomicrmw udec_wrap ptr addrspace(1) %p, i32 1 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%n64 = zext i32 %n32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
store float 1.0, ptr addrspace(1) %p1
@@ -446,7 +446,7 @@ define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1)
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
- %f32 = atomicrmw fadd ptr addrspace(1) %p, float 1.0 syncscope("agent") monotonic
+ %f32 = atomicrmw fadd ptr addrspace(1) %p, float 1.0 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%n32 = fptoui float %f32 to i32
%n64 = zext i32 %n32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
@@ -483,7 +483,7 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1)
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
- %f32 = atomicrmw fsub ptr addrspace(1) %p, float 1.0 syncscope("agent") monotonic
+ %f32 = atomicrmw fsub ptr addrspace(1) %p, float 1.0 syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%n32 = fptoui float %f32 to i32
%n64 = zext i32 %n32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
index d63a36c..0c147b5 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
@@ -107,11 +107,7 @@ define i64 @v_or_i64_disjoint(i64 %a, i64 %b) {
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
index 85180a2..c429b1a 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX942 %s
define amdgpu_kernel void @select_and1(ptr addrspace(1) %p, i32 %x, i32 %y) {
; GCN-LABEL: select_and1:
@@ -56,24 +57,43 @@ define amdgpu_kernel void @select_and3(ptr addrspace(1) %p, i32 %x, i32 %y) {
}
define amdgpu_kernel void @select_and_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> %y) {
-; GCN-LABEL: select_and_v4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s8, s[4:5], 0x2c
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
-; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_gt_i32 s8, 10
-; GCN-NEXT: s_cselect_b32 s3, s3, 0
-; GCN-NEXT: s_cselect_b32 s2, s2, 0
-; GCN-NEXT: s_cselect_b32 s1, s1, 0
-; GCN-NEXT: s_cselect_b32 s0, s0, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
-; GCN-NEXT: s_endpgm
+; GFX9-LABEL: select_and_v4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s8, s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_gt_i32 s8, 10
+; GFX9-NEXT: s_cselect_b32 s3, s3, 0
+; GFX9-NEXT: s_cselect_b32 s2, s2, 0
+; GFX9-NEXT: s_cselect_b32 s1, s1, 0
+; GFX9-NEXT: s_cselect_b32 s0, s0, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: select_and_v4:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s8, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_cmp_gt_i32 s8, 10
+; GFX942-NEXT: s_cselect_b32 s3, s3, 0
+; GFX942-NEXT: s_cselect_b32 s2, s2, 0
+; GFX942-NEXT: s_cselect_b32 s1, s1, 0
+; GFX942-NEXT: s_cselect_b32 s0, s0, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, s0
+; GFX942-NEXT: v_mov_b32_e32 v3, s1
+; GFX942-NEXT: v_mov_b32_e32 v4, s2
+; GFX942-NEXT: v_mov_b32_e32 v5, s3
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7]
+; GFX942-NEXT: s_endpgm
%c = icmp slt i32 %x, 11
%s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
%a = and <4 x i32> %s, %y
@@ -136,24 +156,43 @@ define amdgpu_kernel void @select_or3(ptr addrspace(1) %p, i32 %x, i32 %y) {
}
define amdgpu_kernel void @select_or_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> %y) {
-; GCN-LABEL: select_or_v4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s8, s[4:5], 0x2c
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
-; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lt_i32 s8, 11
-; GCN-NEXT: s_cselect_b32 s3, s3, -1
-; GCN-NEXT: s_cselect_b32 s2, s2, -1
-; GCN-NEXT: s_cselect_b32 s1, s1, -1
-; GCN-NEXT: s_cselect_b32 s0, s0, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
-; GCN-NEXT: s_endpgm
+; GFX9-LABEL: select_or_v4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s8, s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lt_i32 s8, 11
+; GFX9-NEXT: s_cselect_b32 s3, s3, -1
+; GFX9-NEXT: s_cselect_b32 s2, s2, -1
+; GFX9-NEXT: s_cselect_b32 s1, s1, -1
+; GFX9-NEXT: s_cselect_b32 s0, s0, -1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: select_or_v4:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s8, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_cmp_lt_i32 s8, 11
+; GFX942-NEXT: s_cselect_b32 s3, s3, -1
+; GFX942-NEXT: s_cselect_b32 s2, s2, -1
+; GFX942-NEXT: s_cselect_b32 s1, s1, -1
+; GFX942-NEXT: s_cselect_b32 s0, s0, -1
+; GFX942-NEXT: v_mov_b32_e32 v2, s0
+; GFX942-NEXT: v_mov_b32_e32 v3, s1
+; GFX942-NEXT: v_mov_b32_e32 v4, s2
+; GFX942-NEXT: v_mov_b32_e32 v5, s3
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7]
+; GFX942-NEXT: s_endpgm
%c = icmp slt i32 %x, 11
%s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
%a = or <4 x i32> %s, %y
@@ -236,23 +275,41 @@ define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v2i16(ptr ad
}
define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v4i32(ptr addrspace(1) %p, i1 %cond) {
-; GCN-LABEL: sel_constants_sub_constant_sel_constants_v4i32:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_bitcmp1_b32 s2, 0
-; GCN-NEXT: s_cselect_b32 s2, 7, 14
-; GCN-NEXT: s_cselect_b32 s3, 6, 10
-; GCN-NEXT: s_cselect_b32 s4, 5, 6
-; GCN-NEXT: s_cselect_b32 s5, 9, 2
-; GCN-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NEXT: v_mov_b32_e32 v1, s4
-; GCN-NEXT: v_mov_b32_e32 v2, s3
-; GCN-NEXT: v_mov_b32_e32 v3, s2
-; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; GCN-NEXT: s_endpgm
+; GFX9-LABEL: sel_constants_sub_constant_sel_constants_v4i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_bitcmp1_b32 s2, 0
+; GFX9-NEXT: s_cselect_b32 s2, 7, 14
+; GFX9-NEXT: s_cselect_b32 s3, 6, 10
+; GFX9-NEXT: s_cselect_b32 s4, 5, 6
+; GFX9-NEXT: s_cselect_b32 s5, 9, 2
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: sel_constants_sub_constant_sel_constants_v4i32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_bitcmp1_b32 s2, 0
+; GFX942-NEXT: s_cselect_b32 s2, 7, 14
+; GFX942-NEXT: s_cselect_b32 s3, 6, 10
+; GFX942-NEXT: s_cselect_b32 s4, 5, 6
+; GFX942-NEXT: s_cselect_b32 s5, 9, 2
+; GFX942-NEXT: v_mov_b32_e32 v2, s5
+; GFX942-NEXT: v_mov_b32_e32 v3, s4
+; GFX942-NEXT: v_mov_b32_e32 v4, s3
+; GFX942-NEXT: v_mov_b32_e32 v5, s2
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-NEXT: s_endpgm
%sel = select i1 %cond, <4 x i32> <i32 -4, i32 2, i32 3, i32 4>, <4 x i32> <i32 3, i32 1, i32 -1, i32 -3>
%bo = sub <4 x i32> <i32 5, i32 7, i32 9, i32 11>, %sel
store <4 x i32> %bo, ptr addrspace(1) %p, align 32
@@ -461,24 +518,43 @@ define amdgpu_kernel void @fsub_constant_sel_constants_v2f16(ptr addrspace(1) %p
}
define amdgpu_kernel void @fsub_constant_sel_constants_v4f32(ptr addrspace(1) %p, i1 %cond) {
-; GCN-LABEL: fsub_constant_sel_constants_v4f32:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0x41500000
-; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_bitcmp1_b32 s2, 0
-; GCN-NEXT: s_cselect_b32 s2, s3, 0x40c00000
-; GCN-NEXT: s_cselect_b32 s3, 0x41100000, 4.0
-; GCN-NEXT: s_cselect_b32 s4, 0x40a00000, 2.0
-; GCN-NEXT: s_cselect_b32 s5, 1.0, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NEXT: v_mov_b32_e32 v1, s4
-; GCN-NEXT: v_mov_b32_e32 v2, s3
-; GCN-NEXT: v_mov_b32_e32 v3, s2
-; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; GCN-NEXT: s_endpgm
+; GFX9-LABEL: fsub_constant_sel_constants_v4f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0x41500000
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_bitcmp1_b32 s2, 0
+; GFX9-NEXT: s_cselect_b32 s2, s3, 0x40c00000
+; GFX9-NEXT: s_cselect_b32 s3, 0x41100000, 4.0
+; GFX9-NEXT: s_cselect_b32 s4, 0x40a00000, 2.0
+; GFX9-NEXT: s_cselect_b32 s5, 1.0, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: fsub_constant_sel_constants_v4f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0x41500000
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_bitcmp1_b32 s2, 0
+; GFX942-NEXT: s_cselect_b32 s2, s3, 0x40c00000
+; GFX942-NEXT: s_cselect_b32 s3, 0x41100000, 4.0
+; GFX942-NEXT: s_cselect_b32 s4, 0x40a00000, 2.0
+; GFX942-NEXT: s_cselect_b32 s5, 1.0, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, s5
+; GFX942-NEXT: v_mov_b32_e32 v3, s4
+; GFX942-NEXT: v_mov_b32_e32 v4, s3
+; GFX942-NEXT: v_mov_b32_e32 v5, s2
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-NEXT: s_endpgm
%sel = select i1 %cond, <4 x float> <float -2.0, float -3.0, float -4.0, float -5.0>, <4 x float> <float -1.0, float 0.0, float 1.0, float 2.0>
%bo = fsub <4 x float> <float -1.0, float 2.0, float 5.0, float 8.0>, %sel
store <4 x float> %bo, ptr addrspace(1) %p, align 32
diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
index c126f9e..96a2d02e5 100644
--- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
@@ -35,6 +35,6 @@ define amdgpu_kernel void @test_direct_indirect_call() {
ret void
}
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/disable-preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/disable-preload-kernargs.ll
new file mode 100644
index 0000000..75aaec6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/disable-preload-kernargs.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=amdgpu-preload-kernel-arguments -amdgpu-kernarg-preload=0 %s -o - | FileCheck -check-prefix=NO-PRELOAD %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=amdgpu-preload-kernel-arguments %s -o - | FileCheck -check-prefix=DEFAULT-PRELOAD %s
+
+@g1 = protected addrspace(1) externally_initialized global i16 0, align 2
+
+define amdgpu_kernel void @test_kernel_with_zero_kernel_arg() {
+; NO-PRELOAD-LABEL: define amdgpu_kernel void @test_kernel_with_zero_kernel_arg(
+; NO-PRELOAD-SAME: ) #[[ATTR0:[0-9]+]] {
+; NO-PRELOAD-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 12
+; NO-PRELOAD-NEXT: [[GROUP_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
+; NO-PRELOAD-NEXT: store i16 [[GROUP_SIZE_X]], ptr addrspace(1) @g1, align 2
+; NO-PRELOAD-NEXT: ret void
+;
+; DEFAULT-PRELOAD-LABEL: define amdgpu_kernel void @test_kernel_with_zero_kernel_arg(
+; DEFAULT-PRELOAD-SAME: i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]]) #[[ATTR0:[0-9]+]] {
+; DEFAULT-PRELOAD-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; DEFAULT-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 12
+; DEFAULT-PRELOAD-NEXT: [[GROUP_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2
+; DEFAULT-PRELOAD-NEXT: store i16 [[_HIDDEN_GROUP_SIZE_X]], ptr addrspace(1) @g1, align 2
+; DEFAULT-PRELOAD-NEXT: ret void
+;
+ %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 12
+ %group_size_x = load i16, ptr addrspace(4) %gep
+ store i16 %group_size_x, ptr addrspace(1) @g1
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 747affa..49ba0e2 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -228,27 +228,15 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v21, v0
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9
@@ -268,8 +256,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v0, v13, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s11
; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v19, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
@@ -278,20 +264,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v16, v3
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5
; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v19, v0, s[4:5]
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v13, v0, s[4:5]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
@@ -305,8 +285,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v7, v12, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11
; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v14, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v9
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18
@@ -315,20 +293,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v8, v9, s[4:5]
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v9
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11
; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v14, v7, s[4:5]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v12, v7, s[4:5]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8
; GFX9-O0-NEXT: v_xor_b32_e64 v14, v14, v19
@@ -390,7 +362,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8
; GFX9-O0-NEXT: v_min_u32_e64 v7, v7, v8
; GFX9-O0-NEXT: s_mov_b32 s12, 0
-; GFX9-O0-NEXT: ; implicit-def: $sgpr14
; GFX9-O0-NEXT: v_mov_b32_e32 v10, s12
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10
@@ -399,7 +370,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s13
; GFX9-O0-NEXT: v_ffbh_u32_e64 v9, v9
; GFX9-O0-NEXT: v_min_u32_e64 v12, v6, v9
-; GFX9-O0-NEXT: ; implicit-def: $sgpr14
; GFX9-O0-NEXT: v_mov_b32_e32 v6, s12
; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6
@@ -418,8 +388,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9
; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v7, v8, s[8:9]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
@@ -428,7 +396,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13
; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1
; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr16
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
@@ -437,7 +404,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13
; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3
; GFX9-O0-NEXT: v_min_u32_e64 v11, v4, v10
-; GFX9-O0-NEXT: ; implicit-def: $sgpr13
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12
; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4
@@ -455,8 +421,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
@@ -471,15 +435,11 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11
; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11
; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -520,8 +480,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
@@ -529,8 +487,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13]
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4
; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7]
@@ -765,8 +721,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7
; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12]
@@ -799,12 +753,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v18, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8
@@ -822,12 +772,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_addc_co_u32_e32 v16, vcc, v10, v11, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4
; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v20, v9
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16
@@ -950,8 +896,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v18
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[14:15]
@@ -964,8 +908,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_mov_b32 s8, s4
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[6:7]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12
@@ -983,12 +925,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_addc_co_u32_e32 v18, vcc, v14, v15, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v14, s6
; GFX9-O0-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v14, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v19, v13
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12
; GFX9-O0-NEXT: v_mov_b32_e32 v13, s5
@@ -1062,12 +1000,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9
; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
@@ -1110,8 +1044,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5]
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr10
-; GFX9-O0-NEXT: ; implicit-def: $sgpr10
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3
; GFX9-O0-NEXT: v_lshlrev_b64 v[6:7], v2, v[6:7]
@@ -1121,8 +1053,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
@@ -1201,12 +1131,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5
@@ -2501,26 +2427,14 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v3
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -2571,7 +2485,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6
; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6
; GFX9-O0-NEXT: s_mov_b32 s8, 0
-; GFX9-O0-NEXT: ; implicit-def: $sgpr10
; GFX9-O0-NEXT: v_mov_b32_e32 v8, s8
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
@@ -2580,7 +2493,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9
; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7
; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr10
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4
@@ -2601,15 +2513,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v5, v6, s[12:13]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0
; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9
; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1
; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
@@ -2618,7 +2527,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9
; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3
; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v10
-; GFX9-O0-NEXT: ; implicit-def: $sgpr9
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4
@@ -2638,8 +2546,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
@@ -2656,15 +2562,11 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11
; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11
; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -2702,16 +2604,12 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4
; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7]
@@ -2946,8 +2844,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7
; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12]
@@ -2980,12 +2876,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v18, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8
@@ -3003,12 +2895,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_addc_co_u32_e32 v16, vcc, v10, v11, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4
; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v20, v9
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16
@@ -3131,8 +3019,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v18
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[14:15]
@@ -3145,8 +3031,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_mov_b32 s8, s4
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[6:7]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12
@@ -3164,12 +3048,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_addc_co_u32_e32 v18, vcc, v14, v15, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v14, s6
; GFX9-O0-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v14, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v19, v13
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12
; GFX9-O0-NEXT: v_mov_b32_e32 v13, s5
@@ -3243,12 +3123,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9
; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
@@ -3291,8 +3167,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5]
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr10
-; GFX9-O0-NEXT: ; implicit-def: $sgpr10
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3
; GFX9-O0-NEXT: v_lshlrev_b64 v[6:7], v2, v[6:7]
@@ -3302,8 +3176,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
@@ -4415,17 +4287,11 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
@@ -4445,13 +4311,9 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
; GFX9-O0-NEXT: v_addc_co_u32_e32 v5, vcc, v2, v4, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
; GFX9-O0-NEXT: s_mov_b32 s4, 33
@@ -4549,16 +4411,10 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v3
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
; GFX9-O0-NEXT: s_mov_b32 s4, 33
; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
index bf37ccf..43f6def 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
@@ -1,12 +1,13 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s -check-prefixes=GCN,DPP64,GFX90A
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s -check-prefixes=GCN,DPP64,DPPMOV64,GFX942
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s -check-prefixes=GCN,DPP64,GFX90A,DPP64-GFX9 -DCTL=row_newbcast
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s -check-prefixes=GCN,DPP64,DPPMOV64,DPP64-GFX9,GFX942 -DCTL=row_newbcast
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX10 -DCTL=row_share
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX11 -DCTL=row_share
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX1250 -DCTL=row_share
; GCN-LABEL: {{^}}dpp64_ceil:
; GCN: global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]],
-; DPP64: v_ceil_f64_dpp [[V]], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
-; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
+; DPP64: v_ceil_f64_dpp [[V]], [[V]] [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
+; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp64_ceil(ptr addrspace(1) %arg, i64 %in1) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
@@ -21,8 +22,8 @@ define amdgpu_kernel void @dpp64_ceil(ptr addrspace(1) %arg, i64 %in1) {
; GCN-LABEL: {{^}}dpp64_rcp:
; GCN: global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]],
-; DPP64: v_rcp_f64_dpp [[V]], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
-; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
+; DPP64-GFX9: v_rcp_f64_dpp [[V]], [[V]] [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
+; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp64_rcp(ptr addrspace(1) %arg, i64 %in1) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
@@ -52,9 +53,9 @@ define amdgpu_kernel void @dpp64_rcp_unsupported_ctl(ptr addrspace(1) %arg, i64
; GCN-LABEL: {{^}}dpp64_div:
; GCN: global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]],
-; DPPMOV64: v_mov_b64_dpp v[{{[0-9:]+}}], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
-; GFX90A-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
-; GFX10PLUS-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
+; DPPMOV64: v_mov_b64_dpp v[{{[0-9:]+}}], [[V]] [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
+; GFX90A-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
+; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; GCN: v_div_scale_f64
; GCN: v_rcp_f64_e32
define amdgpu_kernel void @dpp64_div(ptr addrspace(1) %arg, i64 %in1) {
@@ -69,6 +70,25 @@ define amdgpu_kernel void @dpp64_div(ptr addrspace(1) %arg, i64 %in1) {
ret void
}
+; On GFX9 the DPP combine fails because v_mul_lo_u32 has no e32 or dpp form.
+; GCN-LABEL: {{^}}dpp_mul_row_share:
+; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
+; DPP64-GFX9: v_mov_b32_e32 [[V2:v[0-9]+]], [[V]]
+; DPP64-GFX9: v_mov_b32_dpp [[V2]], [[V2]] {{row_share|row_newbcast}}:0 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
+; DPP64-GFX9: v_mul_lo_u32 [[V]], [[V2]], [[V]]{{$}}
+; GFX1250: v_mov_b32_e32 [[V2:v[0-9]+]], [[V]]
+; GFX1250: v_mov_b32_dpp [[V2]], [[V2]] {{row_share|row_newbcast}}:0 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
+; GFX1250: v_mul_lo_u32 [[V]], [[V2]], [[V]]{{$}}
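+; The dpp_ctrl value 336 used below is 0x150, which corresponds to row_share:0
+; (printed as row_newbcast:0 on GFX9 targets), matching the controls checked above.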
+define amdgpu_kernel void @dpp_mul_row_share(ptr addrspace(1) %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %id
+ %load = load i32, ptr addrspace(1) %gep
+ %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %load, i32 %load, i32 336, i32 15, i32 15, i1 1)
+ %mul = mul i32 %tmp0, %load
+ store i32 %mul, ptr addrspace(1) %gep
+ ret void
+}
+
; GCN-LABEL: {{^}}dpp64_loop:
; GCN: v_mov_b32_dpp
; DPP64: v_mov_b32_dpp
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
index fb20e72..3725384 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
@@ -1,6 +1,6 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1100
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN
---
@@ -8,8 +8,7 @@
# GCN: %6:vgpr_32, %7:sreg_32_xm0_xexec = V_SUBBREV_U32_e64_dpp %3, %0, %1, %5, 1, 1, 15, 15, 1, implicit $exec
# GCN: %8:vgpr_32 = V_CVT_PK_U8_F32_e64_dpp %3, 4, %0, 2, %2, 2, %1, 1, 1, 15, 15, 1, implicit $mode, implicit $exec
# GCN: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec
-# GFX1100: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
-# GFX1150: %12:vgpr_32 = V_MED3_F32_e64_dpp %3, 0, %1, 0, 2, 0, %7, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GCN: %12:vgpr_32 = V_MED3_F32_e64_dpp %3, 0, %1, 0, 2, 0, %7, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
name: vop3
tracksRegLiveness: true
body: |
@@ -39,12 +38,9 @@ body: |
# GCN-LABEL: name: vop3_sgpr_src1
# GCN: %6:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %1, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
-# GFX1100: %8:vgpr_32 = V_MED3_F32_e64 0, %7, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
-# GFX1150: %8:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %1, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
-# GFX1100: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec
-# GFX1150: %10:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %3, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
-# GFX1100: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 42, 0, %2, 0, 0, implicit $mode, implicit $exec
-# GFX1150: %12:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, 42, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GCN: %8:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %1, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GCN: %10:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %3, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GCN: %12:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, 42, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
# GCN: %14:vgpr_32 = V_MED3_F32_e64 0, %13, 0, 4242, 0, %2, 0, 0, implicit $mode, implicit $exec
name: vop3_sgpr_src1
tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx1250.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx1250.mir
new file mode 100644
index 0000000..9972ec8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx1250.mir
@@ -0,0 +1,18 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=gcn-dpp-combine -o - %s | FileCheck %s -check-prefix=GFX1250
+
+---
+name: v_bitop3_dpp
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: v_bitop3_dpp
+ ; GFX1250: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX1250-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX1250-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: [[V_BITOP3_B32_e64_dpp:%[0-9]+]]:vgpr_32 = V_BITOP3_B32_e64_dpp [[DEF]], [[V_MOV_B32_e32_]], 1, [[V_MOV_B32_dpp]], 128, 0, 15, 15, 1, implicit $exec
+ %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 0, 15, 15, 0, implicit $exec
+ %2:vgpr_32 = V_MOV_B32_dpp %0, %0, 0, 0, 0, 0, implicit $exec
+ %3:vgpr_32 = V_BITOP3_B32_e64 %1, 1, %2, 128, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
index d646460..9936862 100644
--- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
@@ -28,6 +28,6 @@ define amdgpu_kernel void @test_simple_indirect_call() #0 {
attributes #0 = { "amdgpu-no-dispatch-id" }
;.
-; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/dvgpr_sym.ll b/llvm/test/CodeGen/AMDGPU/dvgpr_sym.ll
new file mode 100644
index 0000000..a7c1c22
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dvgpr_sym.ll
@@ -0,0 +1,70 @@
+; Test generation of the _dvgpr$ symbol for an amdgpu_cs_chain function with dynamic vgprs.
+
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefixes=DVGPR %s
+
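+; The offsets below follow a simple pattern (inferred from the checks in this
+; file rather than stated elsewhere): with N VGPRs and block size B, the block
+; count is ceil(max(N, 1) / B), and _dvgpr$<func> is set to <func> plus
+; 8 * (blocks - 1).
+;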
+; Function with 0 VGPRs, which counts as 1 block.
+;
+; DVGPR-LABEL: func0:
+; DVGPR: .set _dvgpr$func0, func0+0
+;
+define amdgpu_cs_chain void @func0() #0 {
+ ret void
+}
+
+; Function with 21 VGPRs, which is 2 blocks.
+;
+; DVGPR-LABEL: func21:
+; DVGPR: .set func21.num_vgpr, 21
+; DVGPR: .set _dvgpr$func21, func21+8
+;
+define amdgpu_cs_chain void @func21(<13 x float> %arg) #0 {
+ tail call void @func21(<13 x float> %arg)
+ ret void
+}
+
+; Anonymous function with 87 VGPRs, which is 6 blocks.
+;
+; DVGPR: [[FUNC87:__unnamed[^:]*]]:
+; DVGPR: .set [[FUNC87]].num_vgpr, 87
+; DVGPR: .set _dvgpr$[[FUNC87]], [[FUNC87]]+40
+;
+define amdgpu_cs_chain void @0(<79 x float> %arg) #0 {
+ tail call void @0(<79 x float> %arg)
+ ret void
+}
+
+; Function with 128 VGPRs, which is 8 blocks.
+;
+; DVGPR-LABEL: func128:
+; DVGPR: .set func128.num_vgpr, 128
+; DVGPR: .set _dvgpr$func128, func128+56
+;
+define amdgpu_cs_chain void @func128(<120 x float> %arg) #0 {
+ tail call void @func128(<120 x float> %arg)
+ ret void
+}
+
+; Function with 79 VGPRs, which is 3 blocks with a block size of 32.
+;
+; DVGPR-LABEL: func79:
+; DVGPR: .set func79.num_vgpr, 79
+; DVGPR: .set _dvgpr$func79, func79+16
+;
+define amdgpu_cs_chain void @func79(<71 x float> %arg) #1 {
+ tail call void @func79(<71 x float> %arg)
+ ret void
+}
+
+; Function with 225 VGPRs, which is 8 blocks with a block size of 32.
+;
+; DVGPR-LABEL: func225:
+; DVGPR: .set func225.num_vgpr, 225
+; DVGPR: .set _dvgpr$func225, func225+56
+;
+define amdgpu_cs_chain void @func225(<217 x float> %arg) #1 {
+ tail call void @func225(<217 x float> %arg)
+ ret void
+}
+
+attributes #0 = { "amdgpu-dynamic-vgpr-block-size"="16" }
+attributes #1 = { "amdgpu-dynamic-vgpr-block-size"="32" }
diff --git a/llvm/test/CodeGen/AMDGPU/dvgpr_sym_fail_too_many_block_size_16.ll b/llvm/test/CodeGen/AMDGPU/dvgpr_sym_fail_too_many_block_size_16.ll
new file mode 100644
index 0000000..362a5e7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dvgpr_sym_fail_too_many_block_size_16.ll
@@ -0,0 +1,15 @@
+; Test failure to generate the _dvgpr$ symbol for an amdgpu_cs_chain function with dynamic vgprs.
+
+; RUN: not llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s 2>&1 | FileCheck -check-prefixes=ERR %s
+
+; Function with 129 VGPRs: that is 9 blocks, which is too many with a block size of 16.
+;
+; ERR-DAG: .set func129.num_vgpr, 129
+; ERR-DAG: too many DVGPR blocks for _dvgpr$ symbol for 'func129'
+;
+define amdgpu_cs_chain void @func129(<121 x float> %arg) #0 {
+ tail call void @func129(<121 x float> %arg)
+ ret void
+}
+
+attributes #0 = { "amdgpu-dynamic-vgpr-block-size"="16" }
diff --git a/llvm/test/CodeGen/AMDGPU/dvgpr_sym_fail_too_many_block_size_16_anon.ll b/llvm/test/CodeGen/AMDGPU/dvgpr_sym_fail_too_many_block_size_16_anon.ll
new file mode 100644
index 0000000..218c009
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dvgpr_sym_fail_too_many_block_size_16_anon.ll
@@ -0,0 +1,24 @@
+; Test failure to generate the _dvgpr$ symbol for an anonymous amdgpu_cs_chain function with dynamic vgprs.
+
+; RUN: not llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s 2>&1 | FileCheck -check-prefixes=ERR %s
+
+; Anonymous function with 129 VGPRs: that is 9 blocks, which is too many with a block size of 16.
+;
+; ERR-DAG: .set __unnamed_1.num_vgpr, 129
+; ERR-DAG: too many DVGPR blocks for _dvgpr$ symbol for '__unnamed_1'
+;
+define amdgpu_cs_chain void @0(<121 x float> %arg) #0 {
+ tail call void @0(<121 x float> %arg)
+ ret void
+}
+
+; Function that is OK and chains to @0 (emitted as '__unnamed_1').
+;
+define amdgpu_cs_chain void @funcOk(<16 x float> %arg) {
+ %vec87 = shufflevector <16 x float> %arg, <16 x float> %arg, <121 x i32> splat(i32 0)
+ tail call void @0(<121 x float> %vec87)
+ ret void
+}
+
+attributes #0 = { "amdgpu-dynamic-vgpr-block-size"="16" }
+
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
index 3f49953..ac30297 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
@@ -154,26 +154,31 @@ define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
; CHECK-LABEL: realign_stack:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
-; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi
+; CHECK-NEXT: v_mov_b32_e32 v32, 0
; CHECK-NEXT: s_cmp_lg_u32 0, s33
-; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo
+; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi
; CHECK-NEXT: s_cmovk_i32 s33, 0x200
-; CHECK-NEXT: s_movk_i32 s32, 0x100
+; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo
+; CHECK-NEXT: scratch_store_b32 off, v32, s33 scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: s_clause 0x7
-; CHECK-NEXT: scratch_store_b128 off, v[28:31], s33 offset:112
; CHECK-NEXT: scratch_store_b128 off, v[24:27], s33 offset:96
-; CHECK-NEXT: scratch_store_b128 off, v[20:23], s33 offset:80
+; CHECK-NEXT: scratch_store_b128 off, v[28:31], s33 offset:112
; CHECK-NEXT: scratch_store_b128 off, v[16:19], s33 offset:64
-; CHECK-NEXT: scratch_store_b128 off, v[12:15], s33 offset:48
+; CHECK-NEXT: scratch_store_b128 off, v[20:23], s33 offset:80
; CHECK-NEXT: scratch_store_b128 off, v[8:11], s33 offset:32
+; CHECK-NEXT: scratch_store_b128 off, v[12:15], s33 offset:48
; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:16
; CHECK-NEXT: scratch_store_b128 off, v[0:3], s33
; CHECK-NEXT: v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT: s_movk_i32 s32, 0x100
; CHECK-NEXT: s_cmovk_i32 s32, 0x300
; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
; CHECK-NEXT: s_alloc_vgpr 0
; CHECK-NEXT: s_endpgm
%v = alloca <32 x i32>, align 128, addrspace(5)
+ ; use volatile store to avoid promotion of alloca to registers
+ store volatile i32 0, ptr addrspace(5) %v
store <32 x i32> %x, ptr addrspace(5) %v
call amdgpu_gfx void @callee(i32 71)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index b0439b1..c5db7a3 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -2536,12 +2536,13 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-SDAG-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX11-SDAG-NEXT: s_mov_b32 s4, s33
; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
; GFX11-SDAG-NEXT: s_mov_b32 s33, s32
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, 15
; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/empty-text.ll b/llvm/test/CodeGen/AMDGPU/empty-text.ll
new file mode 100644
index 0000000..8aa8600
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/empty-text.ll
@@ -0,0 +1,9 @@
+; Test that there is no s_code_end padding if .text is otherwise empty.
+
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN
+
+@globalVar = global i32 37
+
+declare amdgpu_ps void @funcDecl()
+
+; GCN-NOT: .fill
diff --git a/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll b/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll
index f58cb84..839d0ba 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll
@@ -38,11 +38,11 @@ define hidden void @copy(ptr noundef %va) {
; CHECK-NEXT: %va.addr.ascast = addrspacecast ptr addrspace(5) %va.addr to ptr
; CHECK-NEXT: %cp.ascast = addrspacecast ptr addrspace(5) %cp to ptr
; CHECK-NEXT: store ptr %va, ptr addrspace(5) %va.addr, align 8
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %cp)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %cp)
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr %cp.ascast, ptr %va.addr.ascast, i32 8, i1 false)
; CHECK-NEXT: %0 = load ptr, ptr addrspace(5) %cp, align 8
; CHECK-NEXT: call void @valist(ptr noundef %0)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %cp)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %cp)
; CHECK-NEXT: ret void
;
entry:
@@ -51,43 +51,43 @@ entry:
%va.addr.ascast = addrspacecast ptr addrspace(5) %va.addr to ptr
%cp.ascast = addrspacecast ptr addrspace(5) %cp to ptr
store ptr %va, ptr addrspace(5) %va.addr, align 8
- call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %cp)
+ call void @llvm.lifetime.start.p5(ptr addrspace(5) %cp)
call void @llvm.va_copy.p0(ptr %cp.ascast, ptr nonnull %va.addr.ascast)
%0 = load ptr, ptr addrspace(5) %cp, align 8
call void @valist(ptr noundef %0)
- call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %cp)
+ call void @llvm.lifetime.end.p5(ptr addrspace(5) %cp)
ret void
}
-declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) nocapture)
+declare void @llvm.lifetime.start.p5(ptr addrspace(5) nocapture)
declare void @llvm.va_copy.p0(ptr, ptr)
declare hidden void @valist(ptr noundef)
-declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture)
+declare void @llvm.lifetime.end.p5(ptr addrspace(5) nocapture)
define hidden void @start_once(...) {
; CHECK-LABEL: define {{[^@]+}}@start_once(ptr %varargs) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %s = alloca ptr, align 8, addrspace(5)
; CHECK-NEXT: %s.ascast = addrspacecast ptr addrspace(5) %s to ptr
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %s)
; CHECK-NEXT: store ptr %varargs, ptr %s.ascast, align 8
; CHECK-NEXT: %0 = load ptr, ptr addrspace(5) %s, align 8
; CHECK-NEXT: call void @valist(ptr noundef %0)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %s)
; CHECK-NEXT: ret void
;
entry:
%s = alloca ptr, align 8, addrspace(5)
%s.ascast = addrspacecast ptr addrspace(5) %s to ptr
- call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s)
+ call void @llvm.lifetime.start.p5(ptr addrspace(5) %s)
call void @llvm.va_start.p0(ptr %s.ascast)
%0 = load ptr, ptr addrspace(5) %s, align 8
call void @valist(ptr noundef %0)
call void @llvm.va_end.p0(ptr %s.ascast)
- call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s)
+ call void @llvm.lifetime.end.p5(ptr addrspace(5) %s)
ret void
}
@@ -102,16 +102,16 @@ define hidden void @start_twice(...) {
; CHECK-NEXT: %s1 = alloca ptr, align 8, addrspace(5)
; CHECK-NEXT: %s0.ascast = addrspacecast ptr addrspace(5) %s0 to ptr
; CHECK-NEXT: %s1.ascast = addrspacecast ptr addrspace(5) %s1 to ptr
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s0)
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s1)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %s0)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %s1)
; CHECK-NEXT: store ptr %varargs, ptr %s0.ascast, align 8
; CHECK-NEXT: %0 = load ptr, ptr addrspace(5) %s0, align 8
; CHECK-NEXT: call void @valist(ptr noundef %0)
; CHECK-NEXT: store ptr %varargs, ptr %s1.ascast, align 8
; CHECK-NEXT: %1 = load ptr, ptr addrspace(5) %s1, align 8
; CHECK-NEXT: call void @valist(ptr noundef %1)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s1)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s0)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %s1)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %s0)
; CHECK-NEXT: ret void
;
entry:
@@ -119,8 +119,8 @@ entry:
%s1 = alloca ptr, align 8, addrspace(5)
%s0.ascast = addrspacecast ptr addrspace(5) %s0 to ptr
%s1.ascast = addrspacecast ptr addrspace(5) %s1 to ptr
- call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s0)
- call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s1)
+ call void @llvm.lifetime.start.p5(ptr addrspace(5) %s0)
+ call void @llvm.lifetime.start.p5(ptr addrspace(5) %s1)
call void @llvm.va_start.p0(ptr %s0.ascast)
%0 = load ptr, ptr addrspace(5) %s0, align 8
call void @valist(ptr noundef %0)
@@ -129,8 +129,8 @@ entry:
%1 = load ptr, ptr addrspace(5) %s1, align 8
call void @valist(ptr noundef %1)
call void @llvm.va_end.p0(ptr %s1.ascast)
- call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s1)
- call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s0)
+ call void @llvm.lifetime.end.p5(ptr addrspace(5) %s1)
+ call void @llvm.lifetime.end.p5(ptr addrspace(5) %s0)
ret void
}
@@ -138,12 +138,12 @@ define hidden void @single_i32(i32 noundef %x) {
; CHECK-LABEL: define {{[^@]+}}@single_i32(i32 noundef %x) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %vararg_buffer = alloca %single_i32.vararg, align 4, addrspace(5)
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %0 = getelementptr inbounds nuw %single_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store i32 %x, ptr addrspace(5) %0, align 4
; CHECK-NEXT: %1 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void @vararg(ptr %1)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: ret void
;
entry:
@@ -157,12 +157,12 @@ define hidden void @single_double(double noundef %x) {
; CHECK-LABEL: define {{[^@]+}}@single_double(double noundef %x) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %vararg_buffer = alloca %single_double.vararg, align 4, addrspace(5)
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %0 = getelementptr inbounds nuw %single_double.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store double %x, ptr addrspace(5) %0, align 8
; CHECK-NEXT: %1 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void @vararg(ptr %1)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: ret void
;
entry:
@@ -174,12 +174,12 @@ define hidden void @single_v4f32(<4 x float> noundef %x) {
; CHECK-LABEL: define {{[^@]+}}@single_v4f32(<4 x float> noundef %x) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %vararg_buffer = alloca %single_v4f32.vararg, align 4, addrspace(5)
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 16, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %0 = getelementptr inbounds nuw %single_v4f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store <4 x float> %x, ptr addrspace(5) %0, align 16
; CHECK-NEXT: %1 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void @vararg(ptr %1)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 16, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: ret void
;
entry:
@@ -191,12 +191,12 @@ define hidden void @single_v8f32(<8 x float> noundef %x) {
; CHECK-LABEL: define {{[^@]+}}@single_v8f32(<8 x float> noundef %x) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %vararg_buffer = alloca %single_v8f32.vararg, align 4, addrspace(5)
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 32, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %0 = getelementptr inbounds nuw %single_v8f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store <8 x float> %x, ptr addrspace(5) %0, align 32
; CHECK-NEXT: %1 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void @vararg(ptr %1)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 32, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: ret void
;
entry:
@@ -208,12 +208,12 @@ define hidden void @single_v16f32(<16 x float> noundef %x) {
; CHECK-LABEL: define {{[^@]+}}@single_v16f32(<16 x float> noundef %x) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %vararg_buffer = alloca %single_v16f32.vararg, align 4, addrspace(5)
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 64, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %0 = getelementptr inbounds nuw %single_v16f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store <16 x float> %x, ptr addrspace(5) %0, align 64
; CHECK-NEXT: %1 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void @vararg(ptr %1)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 64, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: ret void
;
entry:
@@ -225,12 +225,12 @@ define hidden void @single_v32f32(<32 x float> noundef %x) {
; CHECK-LABEL: define {{[^@]+}}@single_v32f32(<32 x float> noundef %x) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %vararg_buffer = alloca %single_v32f32.vararg, align 4, addrspace(5)
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 128, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %0 = getelementptr inbounds nuw %single_v32f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store <32 x float> %x, ptr addrspace(5) %0, align 128
; CHECK-NEXT: %1 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void @vararg(ptr %1)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 128, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: ret void
;
entry:
@@ -242,14 +242,14 @@ define hidden void @i32_double(i32 noundef %x, double noundef %y) {
; CHECK-LABEL: define {{[^@]+}}@i32_double(i32 noundef %x, double noundef %y) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %vararg_buffer = alloca %i32_double.vararg, align 4, addrspace(5)
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 12, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %0 = getelementptr inbounds nuw %i32_double.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store i32 %x, ptr addrspace(5) %0, align 4
; CHECK-NEXT: %1 = getelementptr inbounds nuw %i32_double.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
; CHECK-NEXT: store double %y, ptr addrspace(5) %1, align 8
; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void @vararg(ptr %2)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 12, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: ret void
;
entry:
@@ -261,14 +261,14 @@ define hidden void @double_i32(double noundef %x, i32 noundef %y) {
; CHECK-LABEL: define {{[^@]+}}@double_i32(double noundef %x, i32 noundef %y) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %vararg_buffer = alloca %double_i32.vararg, align 4, addrspace(5)
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 12, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %0 = getelementptr inbounds nuw %double_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store double %x, ptr addrspace(5) %0, align 8
; CHECK-NEXT: %1 = getelementptr inbounds nuw %double_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
; CHECK-NEXT: store i32 %y, ptr addrspace(5) %1, align 4
; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void @vararg(ptr %2)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 12, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: ret void
;
entry:
@@ -286,14 +286,14 @@ define hidden void @i32_libcS(i32 noundef %x, i8 %y.coerce0, i16 %y.coerce1, i32
; CHECK-NEXT: %.fca.3.insert = insertvalue %struct.libcS %.fca.2.insert, i64 %y.coerce3, 3
; CHECK-NEXT: %.fca.4.insert = insertvalue %struct.libcS %.fca.3.insert, float %y.coerce4, 4
; CHECK-NEXT: %.fca.5.insert = insertvalue %struct.libcS %.fca.4.insert, double %y.coerce5, 5
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 36, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %0 = getelementptr inbounds nuw %i32_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store i32 %x, ptr addrspace(5) %0, align 4
; CHECK-NEXT: %1 = getelementptr inbounds nuw %i32_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
; CHECK-NEXT: store %struct.libcS %.fca.5.insert, ptr addrspace(5) %1, align 8
; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void @vararg(ptr %2)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 36, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: ret void
;
entry:
@@ -317,14 +317,14 @@ define hidden void @libcS_i32(i8 %x.coerce0, i16 %x.coerce1, i32 %x.coerce2, i64
; CHECK-NEXT: %.fca.3.insert = insertvalue %struct.libcS %.fca.2.insert, i64 %x.coerce3, 3
; CHECK-NEXT: %.fca.4.insert = insertvalue %struct.libcS %.fca.3.insert, float %x.coerce4, 4
; CHECK-NEXT: %.fca.5.insert = insertvalue %struct.libcS %.fca.4.insert, double %x.coerce5, 5
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 36, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %0 = getelementptr inbounds nuw %libcS_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store %struct.libcS %.fca.5.insert, ptr addrspace(5) %0, align 8
; CHECK-NEXT: %1 = getelementptr inbounds nuw %libcS_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
; CHECK-NEXT: store i32 %y, ptr addrspace(5) %1, align 4
; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void @vararg(ptr %2)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 36, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: ret void
;
entry:
@@ -342,14 +342,14 @@ define hidden void @i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
; CHECK-LABEL: define {{[^@]+}}@i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %vararg_buffer = alloca %i32_v4f32.vararg, align 4, addrspace(5)
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 20, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %0 = getelementptr inbounds nuw %i32_v4f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store i32 %x, ptr addrspace(5) %0, align 4
; CHECK-NEXT: %1 = getelementptr inbounds nuw %i32_v4f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
; CHECK-NEXT: store <4 x float> %y, ptr addrspace(5) %1, align 16
; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void @vararg(ptr %2)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 20, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: ret void
;
entry:
@@ -361,14 +361,14 @@ define hidden void @v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
; CHECK-LABEL: define {{[^@]+}}@v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %vararg_buffer = alloca %v4f32_i32.vararg, align 4, addrspace(5)
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 20, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %0 = getelementptr inbounds nuw %v4f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store <4 x float> %x, ptr addrspace(5) %0, align 16
; CHECK-NEXT: %1 = getelementptr inbounds nuw %v4f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
; CHECK-NEXT: store i32 %y, ptr addrspace(5) %1, align 4
; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void @vararg(ptr %2)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 20, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: ret void
;
entry:
@@ -380,14 +380,14 @@ define hidden void @i32_v8f32(i32 noundef %x, <8 x float> noundef %y) {
; CHECK-LABEL: define {{[^@]+}}@i32_v8f32(i32 noundef %x, <8 x float> noundef %y) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %vararg_buffer = alloca %i32_v8f32.vararg, align 4, addrspace(5)
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 36, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %0 = getelementptr inbounds nuw %i32_v8f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store i32 %x, ptr addrspace(5) %0, align 4
; CHECK-NEXT: %1 = getelementptr inbounds nuw %i32_v8f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
; CHECK-NEXT: store <8 x float> %y, ptr addrspace(5) %1, align 32
; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void @vararg(ptr %2)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 36, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: ret void
;
entry:
@@ -399,14 +399,14 @@ define hidden void @v8f32_i32(<8 x float> noundef %x, i32 noundef %y) {
; CHECK-LABEL: define {{[^@]+}}@v8f32_i32(<8 x float> noundef %x, i32 noundef %y) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %vararg_buffer = alloca %v8f32_i32.vararg, align 4, addrspace(5)
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 36, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %0 = getelementptr inbounds nuw %v8f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store <8 x float> %x, ptr addrspace(5) %0, align 32
; CHECK-NEXT: %1 = getelementptr inbounds nuw %v8f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
; CHECK-NEXT: store i32 %y, ptr addrspace(5) %1, align 4
; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void @vararg(ptr %2)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 36, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: ret void
;
entry:
@@ -418,14 +418,14 @@ define hidden void @i32_v16f32(i32 noundef %x, <16 x float> noundef %y) {
; CHECK-LABEL: define {{[^@]+}}@i32_v16f32(i32 noundef %x, <16 x float> noundef %y) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %vararg_buffer = alloca %i32_v16f32.vararg, align 4, addrspace(5)
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 68, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %0 = getelementptr inbounds nuw %i32_v16f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store i32 %x, ptr addrspace(5) %0, align 4
; CHECK-NEXT: %1 = getelementptr inbounds nuw %i32_v16f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
; CHECK-NEXT: store <16 x float> %y, ptr addrspace(5) %1, align 64
; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void @vararg(ptr %2)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 68, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: ret void
;
entry:
@@ -437,14 +437,14 @@ define hidden void @v16f32_i32(<16 x float> noundef %x, i32 noundef %y) {
; CHECK-LABEL: define {{[^@]+}}@v16f32_i32(<16 x float> noundef %x, i32 noundef %y) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %vararg_buffer = alloca %v16f32_i32.vararg, align 4, addrspace(5)
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 68, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %0 = getelementptr inbounds nuw %v16f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store <16 x float> %x, ptr addrspace(5) %0, align 64
; CHECK-NEXT: %1 = getelementptr inbounds nuw %v16f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
; CHECK-NEXT: store i32 %y, ptr addrspace(5) %1, align 4
; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void @vararg(ptr %2)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 68, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: ret void
;
entry:
@@ -456,14 +456,14 @@ define hidden void @i32_v32f32(i32 noundef %x, <32 x float> noundef %y) {
; CHECK-LABEL: define {{[^@]+}}@i32_v32f32(i32 noundef %x, <32 x float> noundef %y) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %vararg_buffer = alloca %i32_v32f32.vararg, align 4, addrspace(5)
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 132, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %0 = getelementptr inbounds nuw %i32_v32f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store i32 %x, ptr addrspace(5) %0, align 4
; CHECK-NEXT: %1 = getelementptr inbounds nuw %i32_v32f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
; CHECK-NEXT: store <32 x float> %y, ptr addrspace(5) %1, align 128
; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void @vararg(ptr %2)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 132, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: ret void
;
entry:
@@ -475,14 +475,14 @@ define hidden void @v32f32_i32(<32 x float> noundef %x, i32 noundef %y) {
; CHECK-LABEL: define {{[^@]+}}@v32f32_i32(<32 x float> noundef %x, i32 noundef %y) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %vararg_buffer = alloca %v32f32_i32.vararg, align 4, addrspace(5)
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 132, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %0 = getelementptr inbounds nuw %v32f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store <32 x float> %x, ptr addrspace(5) %0, align 128
; CHECK-NEXT: %1 = getelementptr inbounds nuw %v32f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
; CHECK-NEXT: store i32 %y, ptr addrspace(5) %1, align 4
; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void @vararg(ptr %2)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 132, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: ret void
;
entry:
@@ -495,12 +495,12 @@ define hidden void @fptr_single_i32(i32 noundef %x) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %vararg_buffer = alloca %fptr_single_i32.vararg, align 4, addrspace(5)
; CHECK-NEXT: %0 = load volatile ptr, ptr addrspacecast (ptr addrspace(1) @vararg_ptr to ptr), align 8
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %1 = getelementptr inbounds nuw %fptr_single_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store i32 %x, ptr addrspace(5) %1, align 4
; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void %0(ptr %2)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: ret void
;
entry:
@@ -520,12 +520,12 @@ define hidden void @fptr_libcS(i8 %x.coerce0, i16 %x.coerce1, i32 %x.coerce2, i6
; CHECK-NEXT: %.fca.3.insert = insertvalue %struct.libcS %.fca.2.insert, i64 %x.coerce3, 3
; CHECK-NEXT: %.fca.4.insert = insertvalue %struct.libcS %.fca.3.insert, float %x.coerce4, 4
; CHECK-NEXT: %.fca.5.insert = insertvalue %struct.libcS %.fca.4.insert, double %x.coerce5, 5
-; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 32, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: %1 = getelementptr inbounds nuw %fptr_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
; CHECK-NEXT: store %struct.libcS %.fca.5.insert, ptr addrspace(5) %1, align 8
; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr
; CHECK-NEXT: call void %0(ptr %2)
-; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 32, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) %vararg_buffer)
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
index e1ce534..4349b18 100644
--- a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-MESA %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-PAL %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-MESA %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-PAL %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-MESA %s
; Check EXTRA_LDS_SIZE in SPI_SHADER_PGM_RSRC2_PS.
@@ -29,6 +31,11 @@
; GFX1200-MESA: .long 45100
; GFX1200-MESA-NEXT: .long 1024
+; GFX1250-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x200
+
+; GFX1250-MESA: .long 45100
+; GFX1250-MESA-NEXT: .long 512
+
@lds = internal addrspace(3) global [4096 x i8] poison
define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll
index 01ebe7d..00cde42 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll
@@ -2,12 +2,68 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX1250-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX1250-FAKE16 %s
-/* TODO: Support safe bf16 fdiv lowering.
define bfloat @v_fdiv_bf16(bfloat %x, bfloat %y) {
+; GFX1250-TRUE16-LABEL: v_fdiv_bf16:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX1250-TRUE16-NEXT: v_div_scale_f32 v0, null, v2, v2, v1
+; GFX1250-TRUE16-NEXT: v_div_scale_f32 v4, vcc_lo, v1, v2, v1
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(TRANS32_DEP_1)
+; GFX1250-TRUE16-NEXT: v_rcp_f32_e32 v3, v0
+; GFX1250-TRUE16-NEXT: s_denorm_mode 15
+; GFX1250-TRUE16-NEXT: v_nop
+; GFX1250-TRUE16-NEXT: v_fma_f32 v5, -v0, v3, 1.0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_fmac_f32_e32 v3, v5, v3
+; GFX1250-TRUE16-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_fma_f32 v6, -v0, v5, v4
+; GFX1250-TRUE16-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_fma_f32 v0, -v0, v5, v4
+; GFX1250-TRUE16-NEXT: s_denorm_mode 12
+; GFX1250-TRUE16-NEXT: v_div_fmas_f32 v0, v0, v3, v5
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_div_fixup_f32 v0, v0, v2, v1
+; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_fdiv_bf16:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-FAKE16-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
+; GFX1250-FAKE16-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
+; GFX1250-FAKE16-NEXT: v_rcp_f32_e32 v3, v2
+; GFX1250-FAKE16-NEXT: s_denorm_mode 15
+; GFX1250-FAKE16-NEXT: v_nop
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX1250-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v3
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX1250-FAKE16-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX1250-FAKE16-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX1250-FAKE16-NEXT: s_denorm_mode 12
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX1250-FAKE16-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
%fdiv = fdiv bfloat %x, %y
ret bfloat %fdiv
}
-*/
define bfloat @v_rcp_bf16(bfloat %x) {
; GFX1250-TRUE16-LABEL: v_rcp_bf16:
@@ -82,67 +138,59 @@ define bfloat @v_rcp_bf16_neg(bfloat %x) {
ret bfloat %fdiv
}
-; TODO: Support lowering to v_rsq_bf16.
define bfloat @v_rsq_bf16(bfloat %x) {
; GFX1250-TRUE16-LABEL: v_rsq_bf16:
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
-; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
-; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: v_rsq_bf16_e32 v0.l, v0.l
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-FAKE16-LABEL: v_rsq_bf16:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
-; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
-; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: v_rsq_bf16_e32 v0, v0
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
%sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
%fdiv = fdiv contract bfloat 1.0, %sqrt
ret bfloat %fdiv
}
-; TODO: Support lowering to v_rsq_bf16.
define bfloat @v_rsq_bf16_neg(bfloat %x) {
; GFX1250-TRUE16-LABEL: v_rsq_bf16_neg:
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: v_rsq_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: v_nop
; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
-; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l
+; GFX1250-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-FAKE16-LABEL: v_rsq_bf16_neg:
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: v_rsq_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: v_nop
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
-; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0
+; GFX1250-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
%sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
%fdiv = fdiv contract bfloat -1.0, %sqrt
ret bfloat %fdiv
}
-; TODO: Support lowering to v_rsq_bf16.
define <2 x bfloat> @v_rsq_bf16_multi_use(bfloat %x) {
; GFX1250-TRUE16-LABEL: v_rsq_bf16_multi_use:
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v1.l, v1.l
-; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v1.h, v1.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1250-TRUE16-NEXT: v_rsq_bf16_e32 v1.h, v1.l
; GFX1250-TRUE16-NEXT: v_nop
-; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
;
@@ -150,10 +198,9 @@ define <2 x bfloat> @v_rsq_bf16_multi_use(bfloat %x) {
; GFX1250-FAKE16: ; %bb.0:
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v0
-; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
-; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v1, v1
+; GFX1250-FAKE16-NEXT: v_rsq_bf16_e32 v1, v0
; GFX1250-FAKE16-NEXT: v_nop
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
%sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
@@ -163,7 +210,6 @@ define <2 x bfloat> @v_rsq_bf16_multi_use(bfloat %x) {
ret <2 x bfloat> %r2
}
-; TODO: Support lowering to v_rsq_bf16.
define bfloat @v_rsq_bf16_missing_contract0(bfloat %x) {
; GFX1250-TRUE16-LABEL: v_rsq_bf16_missing_contract0:
; GFX1250-TRUE16: ; %bb.0:
@@ -187,7 +233,6 @@ define bfloat @v_rsq_bf16_missing_contract0(bfloat %x) {
ret bfloat %fdiv
}
-; TODO: Support lowering to v_rsq_bf16.
define bfloat @v_rsq_bf16_missing_contract1(bfloat %x) {
; GFX1250-TRUE16-LABEL: v_rsq_bf16_missing_contract1:
; GFX1250-TRUE16: ; %bb.0:
@@ -211,7 +256,6 @@ define bfloat @v_rsq_bf16_missing_contract1(bfloat %x) {
ret bfloat %fdiv
}
-; TODO: Support lowering to v_rsq_bf16.
define bfloat @v_neg_rsq_bf16_missing_contract1(bfloat %x) {
; GFX1250-TRUE16-LABEL: v_neg_rsq_bf16_missing_contract1:
; GFX1250-TRUE16: ; %bb.0:
@@ -240,11 +284,8 @@ define <2 x bfloat> @v_rsq_v2bf16(<2 x bfloat> %a) {
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v0.h
-; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
-; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2)
-; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.h, v0.h
-; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: v_rsq_bf16_e32 v0.h, v0.h
+; GFX1250-TRUE16-NEXT: v_rsq_bf16_e32 v0.l, v0.l
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-FAKE16-LABEL: v_rsq_v2bf16:
@@ -252,12 +293,9 @@ define <2 x bfloat> @v_rsq_v2bf16(<2 x bfloat> %a) {
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
-; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2)
-; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v1
-; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
-; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
-; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v1, v1
+; GFX1250-FAKE16-NEXT: v_rsq_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1250-FAKE16-NEXT: v_rsq_bf16_e32 v1, v1
; GFX1250-FAKE16-NEXT: v_nop
; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
@@ -271,11 +309,11 @@ define <2 x bfloat> @v_neg_rsq_v2bf16(<2 x bfloat> %a) {
; GFX1250-TRUE16: ; %bb.0:
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v0.h
-; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
-; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2)
-; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.h, -v0.h
-; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l
+; GFX1250-TRUE16-NEXT: v_rsq_bf16_e32 v0.h, v0.h
+; GFX1250-TRUE16-NEXT: v_rsq_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1250-TRUE16-NEXT: v_xor_b16 v0.h, 0x8000, v0.h
+; GFX1250-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-FAKE16-LABEL: v_neg_rsq_v2bf16:
@@ -283,13 +321,12 @@ define <2 x bfloat> @v_neg_rsq_v2bf16(<2 x bfloat> %a) {
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: v_rsq_bf16_e32 v0, v0
; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2)
-; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v1
-; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0
-; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
-; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v1, -v1
-; GFX1250-FAKE16-NEXT: v_nop
+; GFX1250-FAKE16-NEXT: v_rsq_bf16_e32 v1, v1
+; GFX1250-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
%sqrt = call contract <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a)
diff --git a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
index ea1ae04..46ca6d3 100644
--- a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
@@ -9,7 +9,7 @@ body: |
; CHECK-LABEL: name: test_overlap
; CHECK: liveins: $vgpr0_vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $vgpr2_vgpr3, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr3_vgpr4, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit $vgpr0_vgpr1, implicit $exec, implicit $vgpr1_vgpr2 {
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr2_vgpr3, implicit-def $vgpr3_vgpr4, implicit $vgpr0_vgpr1, implicit $exec, implicit $vgpr1_vgpr2 {
; CHECK-NEXT: $vgpr2_vgpr3 = V_LSHLREV_B64_pseudo_e32 1, $vgpr0_vgpr1, implicit $exec
; CHECK-NEXT: $vgpr3_vgpr4 = V_LSHLREV_B64_pseudo_e32 1, $vgpr1_vgpr2, implicit $exec
; CHECK-NEXT: }
diff --git a/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll b/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
index 4f752d1..7e9f21b 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
@@ -50,7 +50,6 @@ define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapt
; GFX90A-NEXT: v_mov_b32_e32 v4, s4
; GFX90A-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: ; implicit-def: $sgpr4
; GFX90A-NEXT: v_readfirstlane_b32 s4, v1
; GFX90A-NEXT: s_mov_b32 m0, s4
; GFX90A-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir
index 5d90bab..635ff4f 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir
@@ -11,7 +11,6 @@ body: |
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_CVT_F16_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F16_U16_fake16_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_CMP_LT_F16_fake16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_fake16_e64 0, [[V_CVT_F16_U16_fake16_e64_]], 0, [[DEF1]], 0, implicit $mode, implicit $exec
; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_fake16_e64_]], implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
@@ -31,7 +30,6 @@ body: |
; GCN-LABEL: name: cvt_hi_f32_f16
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_CVT_F16_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F16_U16_fake16_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_CVT_F16_U16_fake16_e64_]], implicit $exec
; GCN-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_fake16_e64 0, [[V_LSHRREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-gfx12-fake16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-gfx12-fake16.mir
index 1ec7249..7a02698 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-gfx12-fake16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-gfx12-fake16.mir
@@ -8,7 +8,6 @@ body: |
; GCN-LABEL: name: v_s_exp_f16
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_EXP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_EXP_F16_fake16_e64 1, [[V_CVT_F32_U32_e64_]], 1, 1, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
@@ -23,7 +22,6 @@ body: |
; GCN-LABEL: name: v_s_log_f16
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_LOG_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_LOG_F16_fake16_e64 1, [[V_CVT_F32_U32_e64_]], 1, 1, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
@@ -38,7 +36,6 @@ body: |
; GCN-LABEL: name: v_s_rcp_f16
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_RCP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_RCP_F16_fake16_e64 1, [[V_CVT_F32_U32_e64_]], 1, 1, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
@@ -53,7 +50,6 @@ body: |
; GCN-LABEL: name: v_s_rsq_f16
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_RSQ_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_F16_fake16_e64 1, [[V_CVT_F32_U32_e64_]], 1, 1, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
@@ -68,7 +64,6 @@ body: |
; GCN-LABEL: name: v_s_sqrt_f16
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_SQRT_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_SQRT_F16_fake16_e64 1, [[V_CVT_F32_U32_e64_]], 1, 1, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-gfx12-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-gfx12-true16.mir
index 5194d25..28d7cf1 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-gfx12-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-gfx12-true16.mir
@@ -8,7 +8,6 @@ body: |
; GCN-LABEL: name: v_s_exp_f16
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_EXP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_EXP_F16_t16_e64 1, [[V_CVT_F32_U32_e64_]].lo16, 1, 1, 0, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
@@ -23,7 +22,6 @@ body: |
; GCN-LABEL: name: v_s_log_f16
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_LOG_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_LOG_F16_t16_e64 1, [[V_CVT_F32_U32_e64_]].lo16, 1, 1, 0, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
@@ -38,7 +36,6 @@ body: |
; GCN-LABEL: name: v_s_rcp_f16
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_RCP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_RCP_F16_t16_e64 1, [[V_CVT_F32_U32_e64_]].lo16, 1, 1, 0, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
@@ -53,7 +50,6 @@ body: |
; GCN-LABEL: name: v_s_rsq_f16
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_RSQ_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_RSQ_F16_t16_e64 1, [[V_CVT_F32_U32_e64_]].lo16, 1, 1, 0, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
@@ -68,7 +64,6 @@ body: |
; GCN-LABEL: name: v_s_sqrt_f16
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_SQRT_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_SQRT_F16_t16_e64 1, [[V_CVT_F32_U32_e64_]].lo16, 1, 1, 0, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index ac46de6..043bcc3 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -63,10 +63,9 @@ body: |
bb.0:
; GCN-LABEL: name: salu16_usedby_salu32
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[DEF]].lo16, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_TRUNC_F16_t16_e64_]], %subreg.lo16, [[DEF2]], %subreg.hi16
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_TRUNC_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
; GCN-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[REG_SEQUENCE]], [[DEF]], implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:sreg_32 = COPY %0:vgpr_32
@@ -80,10 +79,9 @@ body: |
bb.0:
; GCN-LABEL: name: salu16_usedby_valu32
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[DEF]].lo16, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_TRUNC_F16_t16_e64_]], %subreg.lo16, [[DEF2]], %subreg.hi16
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_TRUNC_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
; GCN-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[REG_SEQUENCE]], [[DEF]], implicit-def $scc, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:sreg_32 = COPY %0:vgpr_32
@@ -97,7 +95,6 @@ body: |
bb.0:
; GCN-LABEL: name: salu32_usedby_salu16
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[DEF]], [[DEF]], implicit $exec
; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[V_XOR_B32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
@@ -112,7 +109,6 @@ body: |
bb.0:
; GCN-LABEL: name: salu32_usedby_valu16
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[DEF]], [[DEF]], implicit $exec
; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[V_XOR_B32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
@@ -155,7 +151,6 @@ body: |
bb.0:
; GCN-LABEL: name: copy_vgpr32_sreg32_usedby_valu16
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[DEF]].lo16, 0, 0, 0, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:sreg_32 = COPY %0:vgpr_32
@@ -168,11 +163,10 @@ body: |
bb.0:
; GCN-LABEL: name: S_FMAC_F16
; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sgpr_lo16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF1]], %subreg.hi16
; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF2]], %subreg.hi16
- ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
- ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF3]], %subreg.hi16
+ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF2]], %subreg.hi16
; GCN-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_FMAC_F16_t16_e64 0, [[REG_SEQUENCE1]].lo16, 0, [[REG_SEQUENCE1]].lo16, 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
%0:vgpr_16 = IMPLICIT_DEF
%1:sgpr_lo16 = COPY %0:vgpr_16
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
index 8bc8eef..d662fce 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
@@ -11,7 +11,6 @@ body: |
; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; REAL16-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; REAL16-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; REAL16-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FMAC_F16_t16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
;
; FAKE16-LABEL: name: fmac_f16
@@ -19,7 +18,6 @@ body: |
; FAKE16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; FAKE16-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; FAKE16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; FAKE16-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; FAKE16-NEXT: [[V_FMAC_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAC_F16_fake16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:sreg_32 = IMPLICIT_DEF
@@ -36,13 +34,11 @@ body: |
; REAL16-LABEL: name: ceil_f16
; REAL16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; REAL16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
;
; FAKE16-LABEL: name: ceil_f16
; FAKE16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; FAKE16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; FAKE16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; FAKE16-NEXT: [[V_CEIL_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_fake16_e64 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
@@ -57,13 +53,11 @@ body: |
; REAL16-LABEL: name: floor_f16
; REAL16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; REAL16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
;
; FAKE16-LABEL: name: floor_f16
; FAKE16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; FAKE16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; FAKE16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; FAKE16-NEXT: [[V_FLOOR_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F16_fake16_e64 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
index 3d06fff..fc82faa 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefix=GCN %s
# GCN-LABEL: name: fix-sgpr-copies
@@ -190,7 +191,6 @@ body: |
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: %6:sreg_64_xexec = nofpexcept V_CMP_LT_F32_e64 0, [[V_CVT_F32_U32_e64_]], 0, [[DEF1]], 0, implicit $mode, implicit $exec
; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %6, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
@@ -212,7 +212,6 @@ body: |
; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
; GCN-NEXT: %6:vgpr_32 = nofpexcept V_FMAC_F32_e64 0, [[V_CVT_F32_U32_e64_]], 0, [[DEF1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
@@ -238,8 +237,6 @@ body: |
# GCN-LABEL: name: s_cselect_b64
# GCN: %0:vgpr_32 = IMPLICIT_DEF
# GCN: %1:vreg_64 = IMPLICIT_DEF
-# GCN: %2:sreg_32 = IMPLICIT_DEF
-# GCN: %3:sreg_64 = IMPLICIT_DEF
# GCN: %7:sreg_64_xexec = V_CMP_EQ_U32_e64 %0, 0, implicit $exec
# GCN: %6:vreg_64 = V_CNDMASK_B64_PSEUDO 0, %1, %7, implicit $exec
name: s_cselect_b64
@@ -252,3 +249,5 @@ body: |
S_CMP_EQ_U32 %2, 0, implicit-def $scc
%4:sreg_64 = S_CSELECT_B64 %3, 0, implicit $scc
...
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+# GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
index 370b43a..497760c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
@@ -11,14 +11,10 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_intrinsic(ptr %ptr, double %d
; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX942-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX90A_GFX942-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX90A_GFX942-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE]]
; GFX90A_GFX942-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s64) on %ir.ptr)
; GFX90A_GFX942-NEXT: S_ENDPGM 0
%ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr %ptr, double %data)
@@ -34,11 +30,7 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_intrinsic(ptr %ptr, double %da
; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX942-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX90A_GFX942-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX90A_GFX942-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
@@ -63,14 +55,10 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %d
; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX942-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX90A_GFX942-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX90A_GFX942-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE]]
; GFX90A_GFX942-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
; GFX90A_GFX942-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
@@ -86,14 +74,10 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw_noprivate(ptr %ptr,
; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX942-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX90A_GFX942-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX90A_GFX942-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE]]
; GFX90A_GFX942-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
; GFX90A_GFX942-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
@@ -109,11 +93,7 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da
; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX942-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX90A_GFX942-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX90A_GFX942-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
@@ -138,11 +118,7 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw__noprivate(ptr %ptr,
; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX942-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX90A_GFX942-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX90A_GFX942-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index 8c7d5cf..ae5da3a 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -503,9 +503,10 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start
@@ -2827,9 +2828,10 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
@@ -8410,13 +8412,12 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -8529,13 +8530,12 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
@@ -8785,13 +8785,12 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -8908,13 +8907,12 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
@@ -9171,13 +9169,12 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -9295,13 +9292,12 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
@@ -9557,11 +9553,11 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -9671,11 +9667,11 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
@@ -9917,11 +9913,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -10035,11 +10031,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
@@ -10288,11 +10284,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -10407,11 +10403,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
@@ -10651,8 +10647,8 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -10735,8 +10731,8 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
@@ -10925,10 +10921,9 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -11014,10 +11009,9 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
@@ -11220,13 +11214,12 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
@@ -11345,13 +11338,12 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
@@ -11610,11 +11602,11 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
@@ -11730,11 +11722,11 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
@@ -16769,14 +16761,11 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -16826,9 +16815,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
@@ -16851,9 +16841,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start
@@ -17341,13 +17332,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v4, v[3:4]
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: flat_load_b32 v4, v[0:1]
; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -17396,9 +17385,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start
@@ -17421,9 +17411,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start
@@ -19323,16 +19314,13 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB70_1: ; %atomicrmw.start
@@ -19377,16 +19365,13 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB70_1: ; %atomicrmw.start
@@ -19473,9 +19458,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
@@ -19516,9 +19502,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
@@ -20309,15 +20296,13 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB73_1: ; %atomicrmw.start
@@ -20362,15 +20347,13 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB73_1: ; %atomicrmw.start
@@ -20456,9 +20439,10 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -20499,9 +20483,10 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index 56ad91d..6218a5c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -382,9 +382,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
@@ -409,9 +410,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
@@ -836,9 +838,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
@@ -863,9 +866,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
@@ -1936,9 +1940,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
@@ -1963,9 +1968,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
@@ -2390,9 +2396,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
@@ -2417,9 +2424,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
@@ -6043,14 +6051,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6168,14 +6176,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
@@ -6438,14 +6446,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6570,14 +6578,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
@@ -6847,14 +6855,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6980,14 +6988,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
@@ -7254,13 +7262,12 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -7376,13 +7383,12 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
@@ -7638,13 +7644,12 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -7767,13 +7772,12 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
@@ -8036,13 +8040,12 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -8166,13 +8169,12 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
@@ -8424,11 +8426,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -8519,11 +8521,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
@@ -8728,10 +8730,9 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -8820,10 +8821,9 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
@@ -9035,14 +9035,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
@@ -9169,14 +9169,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
@@ -9448,13 +9448,12 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
@@ -9579,13 +9578,12 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
@@ -14777,15 +14775,12 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX11-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_pk_max_f16 v1, v2, v2
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14838,9 +14833,10 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2
@@ -14865,9 +14861,10 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_pk_max_f16 v1, v2, v2
@@ -15492,14 +15489,12 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX11-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15552,9 +15547,10 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
@@ -15579,9 +15575,10 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_pk_max_f16 v2, v2, v2
@@ -17225,16 +17222,13 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
@@ -17279,16 +17273,13 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start
@@ -17375,9 +17366,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
@@ -17418,9 +17410,10 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
@@ -18577,15 +18570,13 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
@@ -18630,15 +18621,13 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start
@@ -18724,9 +18713,10 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -18767,9 +18757,10 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index f0083bd..6eafbb5 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -382,9 +382,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
@@ -409,9 +410,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
@@ -836,9 +838,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
@@ -863,9 +866,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
@@ -1936,9 +1940,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
@@ -1963,9 +1968,10 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
@@ -2390,9 +2396,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
@@ -2417,9 +2424,10 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
@@ -6043,14 +6051,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6168,14 +6176,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
@@ -6438,14 +6446,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6570,14 +6578,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
@@ -6847,14 +6855,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6980,14 +6988,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
@@ -7254,13 +7262,12 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -7376,13 +7383,12 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l
; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
@@ -7638,13 +7644,12 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -7767,13 +7772,12 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
@@ -8036,13 +8040,12 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -8166,13 +8169,12 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
@@ -8424,11 +8426,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -8519,11 +8521,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
@@ -8728,10 +8730,9 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -8820,10 +8821,9 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
@@ -9035,14 +9035,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
@@ -9169,14 +9169,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
@@ -9448,13 +9448,12 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
@@ -9579,13 +9578,12 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc
@@ -14777,15 +14775,12 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_pk_max_f16 v1, v2, v2
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14838,9 +14833,10 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v1, v2, v2
@@ -14865,9 +14861,10 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_pk_max_f16 v1, v2, v2
@@ -15492,14 +15489,12 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-NEXT: v_pk_max_f16 v4, v2, v2
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15552,9 +15547,10 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2
@@ -15579,9 +15575,10 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_pk_max_f16 v2, v2, v2
@@ -17225,16 +17222,13 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
@@ -17279,16 +17273,13 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start
@@ -17375,9 +17366,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
@@ -17418,9 +17410,10 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
@@ -18577,15 +18570,13 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
@@ -18630,15 +18621,13 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start
@@ -18724,9 +18713,10 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -18767,9 +18757,10 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
index 3ee0bb2..25f29c8 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
@@ -475,14 +475,11 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -532,9 +529,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start
@@ -557,9 +555,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start
@@ -1068,13 +1067,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v4, v[3:4]
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: flat_load_b32 v4, v[0:1]
; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1123,9 +1120,10 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
@@ -1148,9 +1146,10 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
@@ -2083,14 +2082,11 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX11-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2140,9 +2136,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
@@ -2165,9 +2162,10 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
@@ -2676,13 +2674,11 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX11-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v4, v[3:4]
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: flat_load_b32 v4, v[0:1]
; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2731,9 +2727,10 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
@@ -2756,9 +2753,10 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
@@ -5855,13 +5853,12 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 {
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -5974,13 +5971,12 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 {
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
@@ -6230,13 +6226,12 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6353,13 +6348,12 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
@@ -6616,13 +6610,12 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6740,13 +6733,12 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
@@ -7002,11 +6994,11 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 {
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -7116,11 +7108,11 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 {
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
@@ -7362,11 +7354,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -7480,11 +7472,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
@@ -7733,11 +7725,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -7852,11 +7844,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
@@ -8096,10 +8088,9 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -8185,10 +8176,9 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
@@ -8382,8 +8372,8 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -8466,8 +8456,8 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
@@ -8665,13 +8655,12 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
@@ -8790,13 +8779,12 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
@@ -9055,11 +9043,11 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
@@ -9175,11 +9163,11 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
@@ -14311,14 +14299,11 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2
; GFX11-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14368,9 +14353,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
@@ -14393,9 +14379,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start
@@ -14970,13 +14957,11 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha
; GFX11-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v4, v[3:4]
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: flat_load_b32 v4, v[0:1]
; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15025,9 +15010,10 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start
@@ -15050,9 +15036,10 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start
@@ -16654,16 +16641,13 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start
@@ -16708,16 +16692,13 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start
@@ -16804,9 +16785,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
@@ -16847,9 +16829,10 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
@@ -18006,15 +17989,13 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
@@ -18059,15 +18040,13 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX11-FAKE16-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start
@@ -18153,9 +18132,10 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -18196,9 +18176,10 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
diff --git a/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll
new file mode 100644
index 0000000..e8efa85
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s
+
+; The first load produces an address in a VGPR that is used in the address
+; calculation of the second load (the one inside the loop). The value is uniform
+; and the inner load is correctly selected to use the SADDR form, but the address
+; is promoted to vector registers because the chain starts with a VGPR produced
+; by the load in the entry block.
+;
+; Check that the SADDR form of the load is rewritten to the VADDR form and that no
+; readfirstlane instructions are needed to move the address from VGPRs into SGPRs.
+
+define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocapture %arg1, ptr nocapture %arg2) {
+; GCN-LABEL: test_move_load_address_to_vgpr:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: global_load_b32 v2, v3, s[0:1] scope:SCOPE_SYS
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[2:3]
+; GCN-NEXT: v_add_nc_u32_e32 v2, 0xffffff00, v2
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
+; GCN-NEXT: .LBB0_1: ; %bb3
+; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: s_wait_dscnt 0x0
+; GCN-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: s_wait_xcnt 0x0
+; GCN-NEXT: v_add_nc_u64_e32 v[0:1], 4, v[0:1]
+; GCN-NEXT: v_add_co_u32 v2, s0, v2, 1
+; GCN-NEXT: s_and_b32 vcc_lo, exec_lo, s0
+; GCN-NEXT: s_cbranch_vccz .LBB0_1
+; GCN-NEXT: ; %bb.2: ; %bb2
+; GCN-NEXT: s_endpgm
+bb:
+ %i2 = load volatile i32, ptr addrspace(1) %arg1, align 4
+ br label %bb3
+
+bb2: ; preds = %bb3
+ ret void
+
+bb3: ; preds = %bb3, %bb
+ %i = phi i32 [ %i2, %bb ], [ %i8, %bb3 ]
+ %i4 = zext i32 %i to i64
+ %i5 = getelementptr inbounds i32, ptr %arg2, i64 %i4
+ %i6 = load volatile i32, ptr %i5, align 4
+ %i8 = add nuw nsw i32 %i, 1
+ %i9 = icmp eq i32 %i8, 256
+ br i1 %i9, label %bb2, label %bb3
+}
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index 2ff66c9..004d3c0 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -252,13 +252,15 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB10_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -277,9 +279,12 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB10_2
; GFX1250-SDAG-NEXT: .LBB10_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
@@ -292,15 +297,16 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB10_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -314,13 +320,17 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB10_2
; GFX1250-GISEL-NEXT: .LBB10_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
+; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
@@ -344,11 +354,13 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB11_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -367,8 +379,12 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB11_2
; GFX1250-SDAG-NEXT: .LBB11_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
@@ -381,18 +397,19 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB11_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -406,13 +423,17 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1250-GISEL-NEXT: .LBB11_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
+; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
@@ -433,11 +454,13 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB12_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -455,9 +478,11 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB12_2
; GFX1250-SDAG-NEXT: .LBB12_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
@@ -465,13 +490,14 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB12_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -483,14 +509,17 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB12_2
; GFX1250-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
@@ -508,10 +537,12 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB13_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -529,8 +560,11 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB13_2
; GFX1250-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
@@ -538,16 +572,17 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB13_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -559,14 +594,17 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB13_2
; GFX1250-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
@@ -642,13 +680,15 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-LABEL: flat_add_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB18_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -667,9 +707,11 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB18_2
; GFX1250-SDAG-NEXT: .LBB18_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3]
@@ -683,15 +725,16 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB18_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -705,13 +748,16 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB18_2
; GFX1250-GISEL-NEXT: .LBB18_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[4:5]
@@ -736,11 +782,13 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB19_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -759,8 +807,11 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB19_2
; GFX1250-SDAG-NEXT: .LBB19_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3]
@@ -774,18 +825,19 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB19_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -799,13 +851,16 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB19_2
; GFX1250-GISEL-NEXT: .LBB19_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[4:5]
@@ -827,11 +882,13 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB20_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -849,9 +906,11 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB20_2
; GFX1250-SDAG-NEXT: .LBB20_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
@@ -862,13 +921,14 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB20_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -880,14 +940,17 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB20_2
; GFX1250-GISEL-NEXT: .LBB20_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5]
@@ -908,10 +971,12 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB21_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -929,8 +994,11 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB21_2
; GFX1250-SDAG-NEXT: .LBB21_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
@@ -941,16 +1009,17 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB21_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -962,14 +1031,17 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB21_2
; GFX1250-GISEL-NEXT: .LBB21_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5]
@@ -1048,13 +1120,15 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB26_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1073,9 +1147,11 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB26_2
; GFX1250-SDAG-NEXT: .LBB26_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[2:3]
@@ -1089,15 +1165,16 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB26_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -1111,13 +1188,16 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB26_2
; GFX1250-GISEL-NEXT: .LBB26_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[4:5]
@@ -1142,11 +1222,13 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB27_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1165,8 +1247,11 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB27_2
; GFX1250-SDAG-NEXT: .LBB27_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[2:3]
@@ -1180,18 +1265,19 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB27_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -1205,13 +1291,16 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB27_2
; GFX1250-GISEL-NEXT: .LBB27_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[4:5]
@@ -1233,11 +1322,13 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB28_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1255,9 +1346,11 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB28_2
; GFX1250-SDAG-NEXT: .LBB28_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3]
@@ -1268,13 +1361,14 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB28_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -1286,14 +1380,17 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB28_2
; GFX1250-GISEL-NEXT: .LBB28_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5]
@@ -1314,10 +1411,12 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB29_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1335,8 +1434,11 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB29_2
; GFX1250-SDAG-NEXT: .LBB29_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3]
@@ -1347,16 +1449,17 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB29_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -1368,14 +1471,17 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB29_2
; GFX1250-GISEL-NEXT: .LBB29_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5]
@@ -1454,13 +1560,15 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-LABEL: flat_and_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB34_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1479,9 +1587,11 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB34_2
; GFX1250-SDAG-NEXT: .LBB34_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_and_b32_e32 v3, v1, v3
@@ -1496,15 +1606,16 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB34_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -1518,13 +1629,16 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB34_2
; GFX1250-GISEL-NEXT: .LBB34_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, v0, v4
@@ -1550,11 +1664,13 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB35_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1573,8 +1689,11 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB35_2
; GFX1250-SDAG-NEXT: .LBB35_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_and_b32_e32 v3, v1, v3
@@ -1589,18 +1708,19 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB35_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -1614,13 +1734,16 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB35_2
; GFX1250-GISEL-NEXT: .LBB35_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, v0, v4
@@ -1643,11 +1766,13 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB36_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1665,9 +1790,11 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB36_2
; GFX1250-SDAG-NEXT: .LBB36_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, v1, v3
@@ -1679,13 +1806,14 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB36_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -1697,14 +1825,17 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB36_2
; GFX1250-GISEL-NEXT: .LBB36_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v0, v4
@@ -1726,10 +1857,12 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB37_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1747,8 +1880,11 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB37_2
; GFX1250-SDAG-NEXT: .LBB37_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, v1, v3
@@ -1760,16 +1896,17 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB37_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -1781,14 +1918,17 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB37_2
; GFX1250-GISEL-NEXT: .LBB37_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v0, v4
@@ -1868,13 +2008,15 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-LABEL: flat_or_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB42_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1893,9 +2035,11 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB42_2
; GFX1250-SDAG-NEXT: .LBB42_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_or_b32_e32 v3, v1, v3
@@ -1910,15 +2054,16 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB42_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -1932,13 +2077,16 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB42_2
; GFX1250-GISEL-NEXT: .LBB42_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_or_b32_e32 v2, v0, v4
@@ -1964,11 +2112,13 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB43_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -1987,8 +2137,11 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB43_2
; GFX1250-SDAG-NEXT: .LBB43_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_or_b32_e32 v3, v1, v3
@@ -2003,18 +2156,19 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB43_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -2028,13 +2182,16 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB43_2
; GFX1250-GISEL-NEXT: .LBB43_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_or_b32_e32 v2, v0, v4
@@ -2057,11 +2214,13 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB44_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2079,9 +2238,11 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB44_2
; GFX1250-SDAG-NEXT: .LBB44_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_or_b32_e32 v1, v1, v3
@@ -2093,13 +2254,14 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB44_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -2111,14 +2273,17 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB44_2
; GFX1250-GISEL-NEXT: .LBB44_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_or_b32_e32 v0, v0, v4
@@ -2140,10 +2305,12 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB45_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2161,8 +2328,11 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB45_2
; GFX1250-SDAG-NEXT: .LBB45_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_or_b32_e32 v1, v1, v3
@@ -2174,16 +2344,17 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB45_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -2195,14 +2366,17 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB45_2
; GFX1250-GISEL-NEXT: .LBB45_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_or_b32_e32 v0, v0, v4
@@ -2282,13 +2456,15 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB50_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2307,9 +2483,11 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB50_2
; GFX1250-SDAG-NEXT: .LBB50_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_xor_b32_e32 v3, v1, v3
@@ -2324,15 +2502,16 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB50_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -2346,13 +2525,16 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB50_2
; GFX1250-GISEL-NEXT: .LBB50_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_xor_b32_e32 v2, v0, v4
@@ -2378,11 +2560,13 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB51_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2401,8 +2585,11 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB51_2
; GFX1250-SDAG-NEXT: .LBB51_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_xor_b32_e32 v3, v1, v3
@@ -2417,18 +2604,19 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB51_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -2442,13 +2630,16 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB51_2
; GFX1250-GISEL-NEXT: .LBB51_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_xor_b32_e32 v2, v0, v4
@@ -2471,11 +2662,13 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB52_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2493,9 +2686,11 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB52_2
; GFX1250-SDAG-NEXT: .LBB52_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3
@@ -2507,13 +2702,14 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB52_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -2525,14 +2721,17 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB52_2
; GFX1250-GISEL-NEXT: .LBB52_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
@@ -2554,10 +2753,12 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB53_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2575,8 +2776,11 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB53_2
; GFX1250-SDAG-NEXT: .LBB53_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3
@@ -2588,16 +2792,17 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB53_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -2609,14 +2814,17 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB53_2
; GFX1250-GISEL-NEXT: .LBB53_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
@@ -2690,13 +2898,15 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-LABEL: flat_max_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB58_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2715,10 +2925,12 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB58_2
; GFX1250-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_max_i64 v[2:3], v[0:1], v[2:3]
@@ -2732,15 +2944,16 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB58_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -2753,15 +2966,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: .LBB58_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2
; GFX1250-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_max_i64 v[2:3], v[0:1], v[4:5]
@@ -2786,11 +3002,13 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2809,9 +3027,12 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB59_2
; GFX1250-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_max_i64 v[2:3], v[0:1], v[2:3]
@@ -2825,18 +3046,19 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB59_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -2849,15 +3071,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: .LBB59_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2
; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_max_i64 v[2:3], v[0:1], v[4:5]
@@ -2879,11 +3104,13 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB60_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2900,9 +3127,11 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB60_2
; GFX1250-SDAG-NEXT: .LBB60_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_max_i64 v[0:1], v[0:1], v[2:3]
@@ -2913,13 +3142,14 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB60_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -2930,14 +3160,17 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: .LBB60_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2
; GFX1250-GISEL-NEXT: .LBB60_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_max_i64 v[0:1], v[0:1], v[4:5]
@@ -2958,10 +3191,12 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB61_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -2978,8 +3213,11 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB61_2
; GFX1250-SDAG-NEXT: .LBB61_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_max_i64 v[0:1], v[0:1], v[2:3]
@@ -2990,16 +3228,17 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB61_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -3010,14 +3249,17 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: .LBB61_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] offset:-128
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2
; GFX1250-GISEL-NEXT: .LBB61_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_max_i64 v[0:1], v[0:1], v[4:5]
@@ -3090,13 +3332,15 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-LABEL: flat_min_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB66_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -3115,10 +3359,12 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB66_2
; GFX1250-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_min_i64 v[2:3], v[0:1], v[2:3]
@@ -3132,15 +3378,16 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB66_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -3153,15 +3400,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: .LBB66_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2
; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_min_i64 v[2:3], v[0:1], v[4:5]
@@ -3186,11 +3436,13 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -3209,9 +3461,12 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB67_2
; GFX1250-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_min_i64 v[2:3], v[0:1], v[2:3]
@@ -3225,18 +3480,19 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB67_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -3249,15 +3505,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: .LBB67_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB67_2
; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_min_i64 v[2:3], v[0:1], v[4:5]
@@ -3279,11 +3538,13 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB68_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -3300,9 +3561,11 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB68_2
; GFX1250-SDAG-NEXT: .LBB68_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_min_i64 v[0:1], v[0:1], v[2:3]
@@ -3313,13 +3576,14 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB68_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -3330,14 +3594,17 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: .LBB68_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2
; GFX1250-GISEL-NEXT: .LBB68_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_min_i64 v[0:1], v[0:1], v[4:5]
@@ -3358,10 +3625,12 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB69_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -3378,8 +3647,11 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB69_2
; GFX1250-SDAG-NEXT: .LBB69_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_min_i64 v[0:1], v[0:1], v[2:3]
@@ -3390,16 +3662,17 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB69_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -3410,14 +3683,17 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: .LBB69_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3] offset:-128
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2
; GFX1250-GISEL-NEXT: .LBB69_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_min_i64 v[0:1], v[0:1], v[4:5]
@@ -3490,13 +3766,15 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB74_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -3515,10 +3793,12 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB74_2
; GFX1250-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_max_u64 v[2:3], v[0:1], v[2:3]
@@ -3532,15 +3812,16 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB74_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -3553,15 +3834,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: .LBB74_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2
; GFX1250-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_max_u64 v[2:3], v[0:1], v[4:5]
@@ -3586,11 +3870,13 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -3609,9 +3895,12 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB75_2
; GFX1250-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_max_u64 v[2:3], v[0:1], v[2:3]
@@ -3625,18 +3914,19 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB75_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -3649,15 +3939,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: .LBB75_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2
; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_max_u64 v[2:3], v[0:1], v[4:5]
@@ -3679,11 +3972,13 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB76_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -3700,9 +3995,11 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB76_2
; GFX1250-SDAG-NEXT: .LBB76_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_max_u64 v[0:1], v[0:1], v[2:3]
@@ -3713,13 +4010,14 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB76_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -3730,14 +4028,17 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: .LBB76_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2
; GFX1250-GISEL-NEXT: .LBB76_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_max_u64 v[0:1], v[0:1], v[4:5]
@@ -3758,10 +4059,12 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB77_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -3778,8 +4081,11 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB77_2
; GFX1250-SDAG-NEXT: .LBB77_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_max_u64 v[0:1], v[0:1], v[2:3]
@@ -3790,16 +4096,17 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB77_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -3810,14 +4117,17 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: .LBB77_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] offset:-128
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2
; GFX1250-GISEL-NEXT: .LBB77_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_max_u64 v[0:1], v[0:1], v[4:5]
@@ -3890,13 +4200,15 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB82_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -3915,10 +4227,12 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB82_2
; GFX1250-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_min_u64 v[2:3], v[0:1], v[2:3]
@@ -3932,15 +4246,16 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB82_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -3953,15 +4268,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: .LBB82_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2
; GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_min_u64 v[2:3], v[0:1], v[4:5]
@@ -3986,11 +4304,13 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4009,9 +4329,12 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB83_2
; GFX1250-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_min_u64 v[2:3], v[0:1], v[2:3]
@@ -4025,18 +4348,19 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB83_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -4049,15 +4373,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: .LBB83_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2
; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_min_u64 v[2:3], v[0:1], v[4:5]
@@ -4079,11 +4406,13 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB84_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4100,9 +4429,11 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB84_2
; GFX1250-SDAG-NEXT: .LBB84_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_min_u64 v[0:1], v[0:1], v[2:3]
@@ -4113,13 +4444,14 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB84_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -4130,14 +4462,17 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: .LBB84_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2
; GFX1250-GISEL-NEXT: .LBB84_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_min_u64 v[0:1], v[0:1], v[4:5]
@@ -4158,10 +4493,12 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB85_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4178,8 +4515,11 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB85_2
; GFX1250-SDAG-NEXT: .LBB85_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_min_u64 v[0:1], v[0:1], v[2:3]
@@ -4190,16 +4530,17 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB85_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -4210,14 +4551,17 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: .LBB85_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] offset:-128
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2
; GFX1250-GISEL-NEXT: .LBB85_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_min_u64 v[0:1], v[0:1], v[4:5]
@@ -4310,14 +4654,16 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v3
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB90_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4338,9 +4684,11 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB90_2
; GFX1250-SDAG-NEXT: .LBB90_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v2, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
@@ -4356,15 +4704,16 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v7, v4
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v0, v5
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, s0, v3 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB90_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -4380,13 +4729,16 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v[0:1], v5, v[6:9], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB90_2
; GFX1250-GISEL-NEXT: .LBB90_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
@@ -4414,11 +4766,13 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v3
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB91_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4439,8 +4793,11 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB91_2
; GFX1250-SDAG-NEXT: .LBB91_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v2, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
@@ -4456,18 +4813,19 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v7, v4
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, s0, v3 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB91_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -4483,13 +4841,16 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v[0:1], v5, v[6:9], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB91_2
; GFX1250-GISEL-NEXT: .LBB91_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
@@ -4512,13 +4873,15 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4
-; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v2
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB92_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4538,9 +4901,11 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB92_2
; GFX1250-SDAG-NEXT: .LBB92_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
@@ -4553,13 +4918,14 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB92_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -4573,14 +4939,17 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v0, v[6:9], s[2:3] scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB92_2
; GFX1250-GISEL-NEXT: .LBB92_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
@@ -4603,10 +4972,12 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v2
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB93_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4626,8 +4997,11 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB93_2
; GFX1250-SDAG-NEXT: .LBB93_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
@@ -4640,16 +5014,17 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB93_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -4663,14 +5038,17 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v0, v[6:9], s[2:3] offset:-128 scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB93_2
; GFX1250-GISEL-NEXT: .LBB93_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
@@ -4742,13 +5120,15 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB98_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4766,15 +5146,16 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB98_2
; GFX1250-SDAG-NEXT: .LBB98_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
@@ -4786,15 +5167,16 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB98_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -4806,21 +5188,24 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_branch .LBB98_5
; GFX1250-GISEL-NEXT: .LBB98_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB98_2
; GFX1250-GISEL-NEXT: .LBB98_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
@@ -4843,11 +5228,13 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB99_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4865,14 +5252,16 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB99_2
; GFX1250-SDAG-NEXT: .LBB99_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
@@ -4884,18 +5273,19 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB99_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -4907,21 +5297,24 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_branch .LBB99_5
; GFX1250-GISEL-NEXT: .LBB99_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB99_2
; GFX1250-GISEL-NEXT: .LBB99_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
@@ -4941,11 +5334,13 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB100_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -4961,14 +5356,15 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB100_2
; GFX1250-SDAG-NEXT: .LBB100_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, 0, v5 :: v_dual_cndmask_b32 v0, 0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
@@ -4977,13 +5373,14 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB100_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -4993,20 +5390,23 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB100_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB100_2
; GFX1250-GISEL-NEXT: .LBB100_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
@@ -5025,10 +5425,12 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB101_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -5044,13 +5446,15 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB101_2
; GFX1250-SDAG-NEXT: .LBB101_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1]
; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, 0, v5 :: v_dual_cndmask_b32 v0, 0, v4
; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
@@ -5059,16 +5463,17 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB101_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -5078,20 +5483,23 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB101_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB101_2
; GFX1250-GISEL-NEXT: .LBB101_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
@@ -5161,13 +5569,15 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_rtn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB106_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -5185,10 +5595,12 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB106_2
; GFX1250-SDAG-NEXT: .LBB106_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s0, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
@@ -5207,15 +5619,16 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB106_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -5227,15 +5640,18 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_branch .LBB106_5
; GFX1250-GISEL-NEXT: .LBB106_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB106_2
; GFX1250-GISEL-NEXT: .LBB106_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
@@ -5265,11 +5681,13 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5
+; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB107_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -5287,9 +5705,12 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB107_2
; GFX1250-SDAG-NEXT: .LBB107_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s0, v4
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
@@ -5308,18 +5729,19 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7
+; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB107_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -5331,15 +5753,18 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_branch .LBB107_5
; GFX1250-GISEL-NEXT: .LBB107_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB107_2
; GFX1250-GISEL-NEXT: .LBB107_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v6
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
@@ -5366,11 +5791,13 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB108_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -5386,9 +5813,11 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB108_2
; GFX1250-SDAG-NEXT: .LBB108_4: ; %atomicrmw.private
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
@@ -5404,13 +5833,14 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB108_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -5420,14 +5850,17 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB108_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB108_2
; GFX1250-GISEL-NEXT: .LBB108_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
@@ -5453,10 +5886,12 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
-; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1
+; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB109_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
@@ -5472,8 +5907,11 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB109_2
; GFX1250-SDAG-NEXT: .LBB109_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1]
-; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
+; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
@@ -5489,16 +5927,17 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB109_3
; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow
@@ -5508,14 +5947,17 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB109_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
-; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB109_2
; GFX1250-GISEL-NEXT: .LBB109_4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo
+; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll
index 32888d2..4e9a74a 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll
@@ -9,7 +9,7 @@ define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr(ptr inreg %sbase, ptr %voff
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b8 v0, v2, s[2:3]
+; GFX1250-NEXT: flat_store_b8 v0, v2, s[2:3] scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
%voffset = load i32, ptr %voffset.ptr
%zext.offset = zext i32 %voffset to i64
@@ -24,7 +24,7 @@ define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr_offset_2047(ptr inreg %sbas
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b8 v0, v2, s[2:3] offset:2047
+; GFX1250-NEXT: flat_store_b8 v0, v2, s[2:3] offset:2047 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
%voffset = load i32, ptr %voffset.ptr
%zext.offset = zext i32 %voffset to i64
@@ -40,7 +40,7 @@ define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr_offset_neg2048(ptr inreg %s
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b8 v0, v2, s[2:3] offset:-2048
+; GFX1250-NEXT: flat_store_b8 v0, v2, s[2:3] offset:-2048 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
%voffset = load i32, ptr %voffset.ptr
%zext.offset = zext i32 %voffset to i64
@@ -65,7 +65,7 @@ define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 %d
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3
-; GFX1250-SDAG-NEXT: flat_store_b8 v0, v1, s[0:1]
+; GFX1250-SDAG-NEXT: flat_store_b8 v0, v1, s[0:1] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_uniform_ptr_in_vgprs:
@@ -76,7 +76,7 @@ define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 %d
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: flat_store_b8 v[2:3], v1
+; GFX1250-GISEL-NEXT: flat_store_b8 v[2:3], v1 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%sbase = load ptr, ptr addrspace(3) @ptr.in.lds
%zext.offset = zext i32 %voffset to i64
@@ -94,7 +94,7 @@ define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voff
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3
-; GFX1250-SDAG-NEXT: flat_store_b8 v0, v1, s[0:1] offset:-120
+; GFX1250-SDAG-NEXT: flat_store_b8 v0, v1, s[0:1] offset:-120 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_uniform_ptr_in_vgprs_immoffset:
@@ -105,7 +105,7 @@ define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voff
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: flat_store_b8 v[2:3], v1 offset:-120
+; GFX1250-GISEL-NEXT: flat_store_b8 v[2:3], v1 offset:-120 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%sbase = load ptr, ptr addrspace(3) @ptr.in.lds
%zext.offset = zext i32 %voffset to i64
@@ -122,7 +122,7 @@ define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voff
define amdgpu_ps void @flat_store_saddr_i16_zext_vgpr(ptr inreg %sbase, i32 %voffset, i16 %data) {
; GFX1250-LABEL: flat_store_saddr_i16_zext_vgpr:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3]
+; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -133,7 +133,7 @@ define amdgpu_ps void @flat_store_saddr_i16_zext_vgpr(ptr inreg %sbase, i32 %vof
define amdgpu_ps void @flat_store_saddr_i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i16 %data) {
; GFX1250-LABEL: flat_store_saddr_i16_zext_vgpr_offset_neg128:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] offset:-128
+; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -145,7 +145,7 @@ define amdgpu_ps void @flat_store_saddr_i16_zext_vgpr_offset_neg128(ptr inreg %s
define amdgpu_ps void @flat_store_saddr_f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, half %data) {
; GFX1250-LABEL: flat_store_saddr_f16_zext_vgpr:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3]
+; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -156,7 +156,7 @@ define amdgpu_ps void @flat_store_saddr_f16_zext_vgpr(ptr inreg %sbase, i32 %vof
define amdgpu_ps void @flat_store_saddr_f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, half %data) {
; GFX1250-LABEL: flat_store_saddr_f16_zext_vgpr_offset_neg128:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] offset:-128
+; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -168,7 +168,7 @@ define amdgpu_ps void @flat_store_saddr_f16_zext_vgpr_offset_neg128(ptr inreg %s
define amdgpu_ps void @flat_store_saddr_i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_store_saddr_i32_zext_vgpr:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -179,7 +179,7 @@ define amdgpu_ps void @flat_store_saddr_i32_zext_vgpr(ptr inreg %sbase, i32 %vof
define amdgpu_ps void @flat_store_saddr_i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_store_saddr_i32_zext_vgpr_offset_neg128:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -191,7 +191,7 @@ define amdgpu_ps void @flat_store_saddr_i32_zext_vgpr_offset_neg128(ptr inreg %s
define amdgpu_ps void @flat_store_saddr_f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, float %data) {
; GFX1250-LABEL: flat_store_saddr_f32_zext_vgpr:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -202,7 +202,7 @@ define amdgpu_ps void @flat_store_saddr_f32_zext_vgpr(ptr inreg %sbase, i32 %vof
define amdgpu_ps void @flat_store_saddr_f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, float %data) {
; GFX1250-LABEL: flat_store_saddr_f32_zext_vgpr_offset_neg128:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -214,7 +214,7 @@ define amdgpu_ps void @flat_store_saddr_f32_zext_vgpr_offset_neg128(ptr inreg %s
define amdgpu_ps void @flat_store_saddr_p3_zext_vgpr(ptr inreg %sbase, i32 %voffset, ptr addrspace(3) %data) {
; GFX1250-LABEL: flat_store_saddr_p3_zext_vgpr:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -225,7 +225,7 @@ define amdgpu_ps void @flat_store_saddr_p3_zext_vgpr(ptr inreg %sbase, i32 %voff
define amdgpu_ps void @flat_store_saddr_p3_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, ptr addrspace(3) %data) {
; GFX1250-LABEL: flat_store_saddr_p3_zext_vgpr_offset_neg128:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -238,13 +238,13 @@ define amdgpu_ps void @flat_store_saddr_i64_zext_vgpr(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-LABEL: flat_store_saddr_i64_zext_vgpr:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3]
+; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_i64_zext_vgpr:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
-; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -256,13 +256,13 @@ define amdgpu_ps void @flat_store_saddr_i64_zext_vgpr_offset_neg128(ptr inreg %s
; GFX1250-SDAG-LABEL: flat_store_saddr_i64_zext_vgpr_offset_neg128:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_i64_zext_vgpr_offset_neg128:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
-; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -275,13 +275,13 @@ define amdgpu_ps void @flat_store_saddr_f64_zext_vgpr(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-LABEL: flat_store_saddr_f64_zext_vgpr:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3]
+; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_f64_zext_vgpr:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
-; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -293,13 +293,13 @@ define amdgpu_ps void @flat_store_saddr_f64_zext_vgpr_offset_neg128(ptr inreg %s
; GFX1250-SDAG-LABEL: flat_store_saddr_f64_zext_vgpr_offset_neg128:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_f64_zext_vgpr_offset_neg128:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
-; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -312,13 +312,13 @@ define amdgpu_ps void @flat_store_saddr_v2i32_zext_vgpr(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-LABEL: flat_store_saddr_v2i32_zext_vgpr:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3]
+; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v2i32_zext_vgpr:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
-; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -330,13 +330,13 @@ define amdgpu_ps void @flat_store_saddr_v2i32_zext_vgpr_offset_neg128(ptr inreg
; GFX1250-SDAG-LABEL: flat_store_saddr_v2i32_zext_vgpr_offset_neg128:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v2i32_zext_vgpr_offset_neg128:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
-; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -349,13 +349,13 @@ define amdgpu_ps void @flat_store_saddr_v2f32_zext_vgpr(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-LABEL: flat_store_saddr_v2f32_zext_vgpr:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3]
+; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v2f32_zext_vgpr:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
-; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -367,13 +367,13 @@ define amdgpu_ps void @flat_store_saddr_v2f32_zext_vgpr_offset_neg128(ptr inreg
; GFX1250-SDAG-LABEL: flat_store_saddr_v2f32_zext_vgpr_offset_neg128:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v2f32_zext_vgpr_offset_neg128:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
-; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -386,13 +386,13 @@ define amdgpu_ps void @flat_store_saddr_v4i16_zext_vgpr(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-LABEL: flat_store_saddr_v4i16_zext_vgpr:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3]
+; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v4i16_zext_vgpr:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
-; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -404,13 +404,13 @@ define amdgpu_ps void @flat_store_saddr_v4i16_zext_vgpr_offset_neg128(ptr inreg
; GFX1250-SDAG-LABEL: flat_store_saddr_v4i16_zext_vgpr_offset_neg128:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v4i16_zext_vgpr_offset_neg128:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
-; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -423,13 +423,13 @@ define amdgpu_ps void @flat_store_saddr_v4f16_zext_vgpr(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-LABEL: flat_store_saddr_v4f16_zext_vgpr:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3]
+; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v4f16_zext_vgpr:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
-; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -441,13 +441,13 @@ define amdgpu_ps void @flat_store_saddr_v4f16_zext_vgpr_offset_neg128(ptr inreg
; GFX1250-SDAG-LABEL: flat_store_saddr_v4f16_zext_vgpr_offset_neg128:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v4f16_zext_vgpr_offset_neg128:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
-; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -460,13 +460,13 @@ define amdgpu_ps void @flat_store_saddr_p1_zext_vgpr(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-LABEL: flat_store_saddr_p1_zext_vgpr:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3]
+; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_p1_zext_vgpr:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
-; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -478,13 +478,13 @@ define amdgpu_ps void @flat_store_saddr_p1_zext_vgpr_offset_neg128(ptr inreg %sb
; GFX1250-SDAG-LABEL: flat_store_saddr_p1_zext_vgpr_offset_neg128:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_p1_zext_vgpr_offset_neg128:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
-; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -498,14 +498,14 @@ define amdgpu_ps void @flat_store_saddr_v3i32_zext_vgpr(ptr inreg %sbase, i32 %v
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3]
+; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v3i32_zext_vgpr:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3
-; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -518,14 +518,14 @@ define amdgpu_ps void @flat_store_saddr_v3i32_zext_vgpr_offset_neg128(ptr inreg
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v3i32_zext_vgpr_offset_neg128:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3
-; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -539,14 +539,14 @@ define amdgpu_ps void @flat_store_saddr_v3f32_zext_vgpr(ptr inreg %sbase, i32 %v
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3]
+; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v3f32_zext_vgpr:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3
-; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -559,14 +559,14 @@ define amdgpu_ps void @flat_store_saddr_v3f32_zext_vgpr_offset_neg128(ptr inreg
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v3f32_zext_vgpr_offset_neg128:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3
-; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -580,14 +580,14 @@ define amdgpu_ps void @flat_store_saddr_v6i16_zext_vgpr(ptr inreg %sbase, i32 %v
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3]
+; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v6i16_zext_vgpr:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3
-; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -600,14 +600,14 @@ define amdgpu_ps void @flat_store_saddr_v6i16_zext_vgpr_offset_neg128(ptr inreg
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v6i16_zext_vgpr_offset_neg128:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3
-; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -621,14 +621,14 @@ define amdgpu_ps void @flat_store_saddr_v6f16_zext_vgpr(ptr inreg %sbase, i32 %v
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3]
+; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v6f16_zext_vgpr:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3
-; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -641,14 +641,14 @@ define amdgpu_ps void @flat_store_saddr_v6f16_zext_vgpr_offset_neg128(ptr inreg
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v6f16_zext_vgpr_offset_neg128:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3
-; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -662,14 +662,14 @@ define amdgpu_ps void @flat_store_saddr_v4i32_zext_vgpr(ptr inreg %sbase, i32 %v
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3]
+; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v4i32_zext_vgpr:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
-; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -682,14 +682,14 @@ define amdgpu_ps void @flat_store_saddr_v4i32_zext_vgpr_offset_neg128(ptr inreg
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v4i32_zext_vgpr_offset_neg128:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
-; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -703,14 +703,14 @@ define amdgpu_ps void @flat_store_saddr_v4f32_zext_vgpr(ptr inreg %sbase, i32 %v
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3]
+; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v4f32_zext_vgpr:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
-; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -723,14 +723,14 @@ define amdgpu_ps void @flat_store_saddr_v4f32_zext_vgpr_offset_neg128(ptr inreg
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v4f32_zext_vgpr_offset_neg128:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
-; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -744,14 +744,14 @@ define amdgpu_ps void @flat_store_saddr_v2i64_zext_vgpr(ptr inreg %sbase, i32 %v
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3]
+; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v2i64_zext_vgpr:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
-; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -764,14 +764,14 @@ define amdgpu_ps void @flat_store_saddr_v2i64_zext_vgpr_offset_neg128(ptr inreg
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v2i64_zext_vgpr_offset_neg128:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
-; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -785,14 +785,14 @@ define amdgpu_ps void @flat_store_saddr_v2f64_zext_vgpr(ptr inreg %sbase, i32 %v
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3]
+; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v2f64_zext_vgpr:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
-; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -805,14 +805,14 @@ define amdgpu_ps void @flat_store_saddr_v2f64_zext_vgpr_offset_neg128(ptr inreg
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v2f64_zext_vgpr_offset_neg128:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
-; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -826,14 +826,14 @@ define amdgpu_ps void @flat_store_saddr_v8i16_zext_vgpr(ptr inreg %sbase, i32 %v
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3]
+; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v8i16_zext_vgpr:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
-; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -846,14 +846,14 @@ define amdgpu_ps void @flat_store_saddr_v8i16_zext_vgpr_offset_neg128(ptr inreg
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v8i16_zext_vgpr_offset_neg128:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
-; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -867,14 +867,14 @@ define amdgpu_ps void @flat_store_saddr_v8f16_zext_vgpr(ptr inreg %sbase, i32 %v
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3]
+; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v8f16_zext_vgpr:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
-; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -887,14 +887,14 @@ define amdgpu_ps void @flat_store_saddr_v8f16_zext_vgpr_offset_neg128(ptr inreg
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v8f16_zext_vgpr_offset_neg128:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
-; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -908,14 +908,14 @@ define amdgpu_ps void @flat_store_saddr_v2p1_zext_vgpr(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3]
+; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v2p1_zext_vgpr:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
-; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -928,14 +928,14 @@ define amdgpu_ps void @flat_store_saddr_v2p1_zext_vgpr_offset_neg128(ptr inreg %
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v2p1_zext_vgpr_offset_neg128:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
-; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -949,14 +949,14 @@ define amdgpu_ps void @flat_store_saddr_v4p3_zext_vgpr(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3]
+; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v4p3_zext_vgpr:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
-; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -969,14 +969,14 @@ define amdgpu_ps void @flat_store_saddr_v4p3_zext_vgpr_offset_neg128(ptr inreg %
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
-; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: flat_store_saddr_v4p3_zext_vgpr_offset_neg128:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
-; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -1068,7 +1068,7 @@ define amdgpu_ps void @atomic_flat_store_saddr_i64_zext_vgpr_offset_neg128(ptr i
define amdgpu_ps void @flat_store_saddr_i16_d16hi_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) {
; GFX1250-LABEL: flat_store_saddr_i16_d16hi_zext_vgpr:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: flat_store_d16_hi_b16 v0, v1, s[2:3]
+; GFX1250-NEXT: flat_store_d16_hi_b16 v0, v1, s[2:3] scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -1080,7 +1080,7 @@ define amdgpu_ps void @flat_store_saddr_i16_d16hi_zext_vgpr(ptr inreg %sbase, i3
define amdgpu_ps void @flat_store_saddr_i16_d16hi_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) {
; GFX1250-LABEL: flat_store_saddr_i16_d16hi_zext_vgpr_offset_neg128:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: flat_store_d16_hi_b16 v0, v1, s[2:3] offset:-128
+; GFX1250-NEXT: flat_store_d16_hi_b16 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -1093,7 +1093,7 @@ define amdgpu_ps void @flat_store_saddr_i16_d16hi_zext_vgpr_offset_neg128(ptr in
define amdgpu_ps void @flat_store_saddr_i16_d16hi_trunci8_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) {
; GFX1250-LABEL: flat_store_saddr_i16_d16hi_trunci8_zext_vgpr:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: flat_store_d16_hi_b8 v0, v1, s[2:3]
+; GFX1250-NEXT: flat_store_d16_hi_b8 v0, v1, s[2:3] scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -1106,7 +1106,7 @@ define amdgpu_ps void @flat_store_saddr_i16_d16hi_trunci8_zext_vgpr(ptr inreg %s
define amdgpu_ps void @flat_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) {
; GFX1250-LABEL: flat_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: flat_store_d16_hi_b8 v0, v1, s[2:3] offset:-128
+; GFX1250-NEXT: flat_store_d16_hi_b8 v0, v1, s[2:3] offset:-128 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index b25d9b2..fc88839 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -3621,7 +3621,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_movk_i32 s0, 0x3004
+; GFX9-NEXT: s_movk_i32 s0, 0x3000
+; GFX9-NEXT: s_add_i32 s0, s0, 4
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -3637,7 +3638,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; GFX10-NEXT: v_mov_b32_e32 v0, 13
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_movk_i32 s0, 0x3804
+; GFX10-NEXT: s_movk_i32 s0, 0x3800
+; GFX10-NEXT: s_add_i32 s0, s0, 4
; GFX10-NEXT: scratch_store_dword off, v0, off offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
@@ -3682,7 +3684,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:4
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3004
+; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000
+; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
@@ -3716,8 +3719,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800
; GFX1010-PAL-NEXT: s_mov_b32 s1, 0
-; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3804
+; GFX1010-PAL-NEXT: s_add_i32 s0, s0, 4
; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s1 offset:4
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
@@ -3739,7 +3743,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15
-; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3804
+; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800
+; GFX1030-PAL-NEXT: s_add_i32 s0, s0, 4
; GFX1030-PAL-NEXT: scratch_store_dword off, v0, off offset:4
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
@@ -3785,10 +3790,12 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s0, 0x3000
; GFX9-NEXT: v_mov_b32_e32 v0, 13
+; GFX9-NEXT: s_add_i32 s1, s32, s0
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_add_i32 s0, s32, 0x3004
+; GFX9-NEXT: s_add_i32 s0, s1, 4
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -3800,8 +3807,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 13
+; GFX10-NEXT: s_movk_i32 s0, 0x3800
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_add_i32 s0, s32, 0x3804
+; GFX10-NEXT: s_add_i32 s1, s32, s0
+; GFX10-NEXT: s_add_i32 s0, s1, 4
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
@@ -3843,10 +3852,12 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13
+; GFX9-PAL-NEXT: s_add_i32 s1, s32, s0
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x3004
+; GFX9-PAL-NEXT: s_add_i32 s0, s1, 4
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
@@ -3872,8 +3883,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13
+; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x3804
+; GFX10-PAL-NEXT: s_add_i32 s1, s32, s0
+; GFX10-PAL-NEXT: s_add_i32 s0, s1, 4
; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index 3304dbf..57be290 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -805,7 +805,7 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) {
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -879,7 +879,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %i
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -962,7 +962,7 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
- %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1054,7 +1054,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2,
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
- %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -1114,7 +1114,7 @@ define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1183,7 +1183,7 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -1261,7 +1261,7 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index)
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
- %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1348,7 +1348,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
- %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -1413,7 +1413,7 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) {
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1487,7 +1487,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %i
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -1570,7 +1570,7 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
- %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1662,7 +1662,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2,
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
- %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -1722,7 +1722,7 @@ define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1791,7 +1791,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -1869,7 +1869,7 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index)
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
- %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1956,7 +1956,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
- %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -2017,7 +2017,7 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) {
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2090,7 +2090,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %i
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -2169,7 +2169,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
- %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2260,7 +2260,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
- %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -2316,7 +2316,7 @@ define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2384,7 +2384,7 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -2458,7 +2458,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index)
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
- %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2544,7 +2544,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
- %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -2605,7 +2605,7 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) {
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2678,7 +2678,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 %
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -2757,7 +2757,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
- %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2848,7 +2848,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
- %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -2904,7 +2904,7 @@ define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2972,7 +2972,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -3046,7 +3046,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index)
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
- %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -3132,7 +3132,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
- %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -3193,7 +3193,7 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) {
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -3266,7 +3266,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %i
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -3345,7 +3345,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
- %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -3436,7 +3436,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
- %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -3492,7 +3492,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -3560,7 +3560,7 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -3634,7 +3634,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index)
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
- %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -3720,7 +3720,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
- %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -3781,7 +3781,7 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) {
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -3854,7 +3854,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 %
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -3933,7 +3933,7 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
- %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -4024,7 +4024,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
- %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -4080,7 +4080,7 @@ define amdgpu_kernel void @atomic_umin_i32(ptr %out, i32 %in) {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -4148,7 +4148,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -4222,7 +4222,7 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index)
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
- %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -4308,7 +4308,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
- %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -4373,7 +4373,7 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) {
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -4447,7 +4447,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -4530,7 +4530,7 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
- %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -4622,7 +4622,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2,
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
- %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -4682,7 +4682,7 @@ define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -4751,7 +4751,7 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -4829,7 +4829,7 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) {
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
- %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -4916,7 +4916,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
- %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -5653,7 +5653,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
+ %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -6297,7 +6297,7 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) {
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -6371,7 +6371,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %i
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -6454,7 +6454,7 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
- %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -6546,7 +6546,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2,
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
- %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -6606,7 +6606,7 @@ define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -6675,7 +6675,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -6753,7 +6753,7 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index)
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
- %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -6840,7 +6840,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
- %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -9126,7 +9126,7 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) {
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -9190,7 +9190,7 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) {
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 1023
- %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -9258,7 +9258,7 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) {
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 1024
- %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -9332,7 +9332,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %i
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -9415,7 +9415,7 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
- %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -9507,7 +9507,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2,
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
- %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -9567,7 +9567,7 @@ define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -9636,7 +9636,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -9714,7 +9714,7 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index)
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
- %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -9801,7 +9801,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
- %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -9866,7 +9866,7 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) {
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -9930,7 +9930,7 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) {
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 1023
- %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -9998,7 +9998,7 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) {
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 1024
- %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -10072,7 +10072,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %i
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %out, i32 4
- %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -10155,7 +10155,7 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
- %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -10247,7 +10247,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2,
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
%gep = getelementptr i32, ptr %ptr, i32 4
- %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -10307,7 +10307,7 @@ define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -10376,7 +10376,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) {
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -10454,7 +10454,7 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index)
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
- %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -10541,7 +10541,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr %out, i64 %index
- %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %val, ptr %out2
ret void
}
@@ -10851,3 +10851,5 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) {
store bfloat %val, ptr %out
ret void
}
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
index 1311560..e74ad3d 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
@@ -1061,25 +1061,64 @@ define void @flat_atomic_sub_i32_noret(ptr %ptr, i32 %in) {
; GCN1-LABEL: flat_atomic_sub_i32_noret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_sub v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB30_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB30_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_sub_i32_noret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_sub v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB30_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB30_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_sub_i32_noret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_sub v[0:1], v2
+; GCN3-NEXT: flat_load_dword v4, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB30_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB30_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw sub ptr %ptr, i32 %in seq_cst
ret void
@@ -1091,9 +1130,22 @@ define void @flat_atomic_sub_i32_noret_offset(ptr %out, i32 %in) {
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_sub v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB31_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB31_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_sub_i32_noret_offset:
@@ -1101,17 +1153,43 @@ define void @flat_atomic_sub_i32_noret_offset(ptr %out, i32 %in) {
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_sub v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB31_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB31_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_sub_i32_noret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB31_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB31_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst
@@ -1122,25 +1200,67 @@ define i32 @flat_atomic_sub_i32_ret(ptr %ptr, i32 %in) {
; GCN1-LABEL: flat_atomic_sub_i32_ret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB32_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB32_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v0, v3
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_sub_i32_ret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB32_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB32_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v0, v3
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_sub_i32_ret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB32_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB32_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw sub ptr %ptr, i32 %in seq_cst
ret i32 %result
@@ -1150,29 +1270,69 @@ define i32 @flat_atomic_sub_i32_ret_offset(ptr %out, i32 %in) {
; GCN1-LABEL: flat_atomic_sub_i32_ret_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v0, v[3:4]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB33_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v1, v0
+; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v1, v2
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB33_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_sub_i32_ret_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v0, v[3:4]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB33_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v1, v0
+; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v1, v2
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB33_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_sub_i32_ret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB33_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB33_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%result = atomicrmw sub ptr %gep, i32 %in seq_cst
@@ -1185,10 +1345,22 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inr
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_sub v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB34_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_subrev_i32_e32 v2, vcc, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB34_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_sub_i32_noret_scalar:
@@ -1196,10 +1368,22 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inr
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_sub v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB34_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_subrev_u32_e32 v2, vcc, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB34_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_sub_i32_noret_scalar:
@@ -1207,10 +1391,22 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inr
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_sub v[0:1], v2
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB34_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_subrev_u32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB34_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw sub ptr %ptr, i32 %in seq_cst
ret void
@@ -1224,10 +1420,22 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s34
; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_sub v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB35_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_subrev_i32_e32 v2, vcc, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB35_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_sub_i32_noret_offset_scalar:
@@ -1237,10 +1445,22 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s34
; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_sub v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB35_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_subrev_u32_e32 v2, vcc, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB35_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_sub_i32_noret_offset_scalar:
@@ -1248,10 +1468,22 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB35_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_subrev_u32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB35_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst
@@ -1264,10 +1496,24 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v0, v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v1, s4
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v2, s5
+; GCN1-NEXT: .LBB36_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_subrev_i32_e32 v3, vcc, s6, v4
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB36_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_sub_i32_ret_scalar:
@@ -1275,10 +1521,24 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v0, v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v1, s4
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v2, s5
+; GCN2-NEXT: .LBB36_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_subrev_u32_e32 v3, vcc, s6, v4
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB36_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_sub_i32_ret_scalar:
@@ -1286,10 +1546,24 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v0, v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
+; GCN3-NEXT: .LBB36_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_subrev_u32_e32 v3, s6, v4
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB36_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw sub ptr %ptr, i32 %in seq_cst
ret i32 %result
@@ -1301,12 +1575,24 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 16
; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; GCN1-NEXT: v_mov_b32_e32 v1, s34
+; GCN1-NEXT: v_mov_b32_e32 v2, s35
+; GCN1-NEXT: flat_load_dword v0, v[1:2]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB37_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_subrev_i32_e32 v3, vcc, s6, v4
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB37_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_sub_i32_ret_offset_scalar:
@@ -1314,12 +1600,24 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 16
; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; GCN2-NEXT: v_mov_b32_e32 v1, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s35
+; GCN2-NEXT: flat_load_dword v0, v[1:2]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB37_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_subrev_u32_e32 v3, vcc, s6, v4
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB37_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_sub_i32_ret_offset_scalar:
@@ -1327,10 +1625,24 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
+; GCN3-NEXT: .LBB37_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_subrev_u32_e32 v3, s6, v4
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB37_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%result = atomicrmw sub ptr %gep, i32 %in seq_cst
@@ -1411,25 +1723,64 @@ define void @flat_atomic_and_i32_noret(ptr %ptr, i32 %in) {
; GCN1-LABEL: flat_atomic_and_i32_noret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_and v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB40_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v3, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB40_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_and_i32_noret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_and v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB40_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v3, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB40_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_and_i32_noret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_and v[0:1], v2
+; GCN3-NEXT: flat_load_dword v4, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB40_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_and_b32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB40_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw and ptr %ptr, i32 %in seq_cst
ret void
@@ -1441,9 +1792,22 @@ define void @flat_atomic_and_i32_noret_offset(ptr %out, i32 %in) {
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_and v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB41_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v3, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB41_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_and_i32_noret_offset:
@@ -1451,17 +1815,43 @@ define void @flat_atomic_and_i32_noret_offset(ptr %out, i32 %in) {
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_and v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB41_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v3, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB41_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_and_i32_noret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB41_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_and_b32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB41_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst
@@ -1472,25 +1862,67 @@ define i32 @flat_atomic_and_i32_ret(ptr %ptr, i32 %in) {
; GCN1-LABEL: flat_atomic_and_i32_ret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB42_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: v_and_b32_e32 v3, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB42_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v0, v3
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_and_i32_ret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB42_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: v_and_b32_e32 v3, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB42_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v0, v3
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_and_i32_ret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB42_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_and_b32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB42_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw and ptr %ptr, i32 %in seq_cst
ret i32 %result
@@ -1500,29 +1932,69 @@ define i32 @flat_atomic_and_i32_ret_offset(ptr %out, i32 %in) {
; GCN1-LABEL: flat_atomic_and_i32_ret_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v0, v[3:4]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB43_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v1, v0
+; GCN1-NEXT: v_and_b32_e32 v0, v1, v2
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB43_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_and_i32_ret_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v0, v[3:4]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB43_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v1, v0
+; GCN2-NEXT: v_and_b32_e32 v0, v1, v2
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB43_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_and_i32_ret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB43_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_and_b32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB43_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%result = atomicrmw and ptr %gep, i32 %in seq_cst
@@ -1535,10 +2007,22 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inr
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_and v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB44_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB44_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_and_i32_noret_scalar:
@@ -1546,10 +2030,22 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inr
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_and v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB44_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB44_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_and_i32_noret_scalar:
@@ -1557,10 +2053,22 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inr
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_and v[0:1], v2
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB44_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB44_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw and ptr %ptr, i32 %in seq_cst
ret void
@@ -1574,10 +2082,22 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s34
; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_and v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB45_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB45_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_and_i32_noret_offset_scalar:
@@ -1587,10 +2107,22 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s34
; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_and v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB45_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB45_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_and_i32_noret_offset_scalar:
@@ -1598,10 +2130,22 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB45_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB45_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst
@@ -1614,10 +2158,24 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v0, v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v1, s4
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v2, s5
+; GCN1-NEXT: .LBB46_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_and_b32_e32 v3, s6, v4
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB46_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_and_i32_ret_scalar:
@@ -1625,10 +2183,24 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v0, v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v1, s4
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v2, s5
+; GCN2-NEXT: .LBB46_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_and_b32_e32 v3, s6, v4
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB46_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_and_i32_ret_scalar:
@@ -1636,10 +2208,24 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v0, v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
+; GCN3-NEXT: .LBB46_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_and_b32_e32 v3, s6, v4
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB46_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw and ptr %ptr, i32 %in seq_cst
ret i32 %result
@@ -1651,12 +2237,24 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 16
; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; GCN1-NEXT: v_mov_b32_e32 v1, s34
+; GCN1-NEXT: v_mov_b32_e32 v2, s35
+; GCN1-NEXT: flat_load_dword v0, v[1:2]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB47_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_and_b32_e32 v3, s6, v4
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB47_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_and_i32_ret_offset_scalar:
@@ -1664,12 +2262,24 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 16
; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; GCN2-NEXT: v_mov_b32_e32 v1, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s35
+; GCN2-NEXT: flat_load_dword v0, v[1:2]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB47_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_and_b32_e32 v3, s6, v4
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB47_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_and_i32_ret_offset_scalar:
@@ -1677,10 +2287,24 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
+; GCN3-NEXT: .LBB47_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_and_b32_e32 v3, s6, v4
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB47_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%result = atomicrmw and ptr %gep, i32 %in seq_cst
@@ -2532,25 +3156,64 @@ define void @flat_atomic_or_i32_noret(ptr %ptr, i32 %in) {
; GCN1-LABEL: flat_atomic_or_i32_noret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_or v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB60_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v3, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB60_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_or_i32_noret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_or v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB60_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v3, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB60_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_or_i32_noret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_or v[0:1], v2
+; GCN3-NEXT: flat_load_dword v4, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB60_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB60_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw or ptr %ptr, i32 %in seq_cst
ret void
@@ -2562,9 +3225,22 @@ define void @flat_atomic_or_i32_noret_offset(ptr %out, i32 %in) {
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_or v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB61_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v3, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB61_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_or_i32_noret_offset:
@@ -2572,17 +3248,43 @@ define void @flat_atomic_or_i32_noret_offset(ptr %out, i32 %in) {
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_or v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB61_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v3, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB61_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_or_i32_noret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB61_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB61_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst
@@ -2593,25 +3295,67 @@ define i32 @flat_atomic_or_i32_ret(ptr %ptr, i32 %in) {
; GCN1-LABEL: flat_atomic_or_i32_ret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB62_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: v_or_b32_e32 v3, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB62_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v0, v3
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_or_i32_ret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB62_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: v_or_b32_e32 v3, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB62_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v0, v3
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_or_i32_ret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB62_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_or_b32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB62_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw or ptr %ptr, i32 %in seq_cst
ret i32 %result
@@ -2621,29 +3365,69 @@ define i32 @flat_atomic_or_i32_ret_offset(ptr %out, i32 %in) {
; GCN1-LABEL: flat_atomic_or_i32_ret_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v0, v[3:4]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB63_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v1, v0
+; GCN1-NEXT: v_or_b32_e32 v0, v1, v2
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB63_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_or_i32_ret_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v0, v[3:4]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB63_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v1, v0
+; GCN2-NEXT: v_or_b32_e32 v0, v1, v2
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB63_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_or_i32_ret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB63_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_or_b32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB63_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%result = atomicrmw or ptr %gep, i32 %in seq_cst
@@ -2656,10 +3440,22 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inre
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_or v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB64_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB64_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_or_i32_noret_scalar:
@@ -2667,10 +3463,22 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inre
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_or v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB64_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB64_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_or_i32_noret_scalar:
@@ -2678,10 +3486,22 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inre
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_or v[0:1], v2
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB64_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB64_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw or ptr %ptr, i32 %in seq_cst
ret void
@@ -2695,10 +3515,22 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s34
; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_or v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB65_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB65_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_or_i32_noret_offset_scalar:
@@ -2708,10 +3540,22 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s34
; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_or v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB65_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB65_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_or_i32_noret_offset_scalar:
@@ -2719,10 +3563,22 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB65_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB65_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst
@@ -2735,10 +3591,24 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg %
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v0, v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v1, s4
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v2, s5
+; GCN1-NEXT: .LBB66_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_or_b32_e32 v3, s6, v4
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB66_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_or_i32_ret_scalar:
@@ -2746,10 +3616,24 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg %
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v0, v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v1, s4
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v2, s5
+; GCN2-NEXT: .LBB66_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_or_b32_e32 v3, s6, v4
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB66_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_or_i32_ret_scalar:
@@ -2757,10 +3641,24 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg %
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v0, v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
+; GCN3-NEXT: .LBB66_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_or_b32_e32 v3, s6, v4
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB66_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw or ptr %ptr, i32 %in seq_cst
ret i32 %result
@@ -2772,12 +3670,24 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 16
; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; GCN1-NEXT: v_mov_b32_e32 v1, s34
+; GCN1-NEXT: v_mov_b32_e32 v2, s35
+; GCN1-NEXT: flat_load_dword v0, v[1:2]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB67_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_or_b32_e32 v3, s6, v4
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB67_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_or_i32_ret_offset_scalar:
@@ -2785,12 +3695,24 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 16
; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; GCN2-NEXT: v_mov_b32_e32 v1, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s35
+; GCN2-NEXT: flat_load_dword v0, v[1:2]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB67_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_or_b32_e32 v3, s6, v4
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB67_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_or_i32_ret_offset_scalar:
@@ -2798,10 +3720,24 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
+; GCN3-NEXT: .LBB67_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_or_b32_e32 v3, s6, v4
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB67_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%result = atomicrmw or ptr %gep, i32 %in seq_cst
@@ -2882,25 +3818,64 @@ define void @flat_atomic_xor_i32_noret(ptr %ptr, i32 %in) {
; GCN1-LABEL: flat_atomic_xor_i32_noret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_xor v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB70_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_xor_b32_e32 v3, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB70_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_xor_i32_noret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_xor v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB70_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_xor_b32_e32 v3, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB70_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_xor_i32_noret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_xor v[0:1], v2
+; GCN3-NEXT: flat_load_dword v4, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB70_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB70_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst
ret void
@@ -2912,9 +3887,22 @@ define void @flat_atomic_xor_i32_noret_offset(ptr %out, i32 %in) {
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_xor v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB71_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_xor_b32_e32 v3, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB71_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_xor_i32_noret_offset:
@@ -2922,17 +3910,43 @@ define void @flat_atomic_xor_i32_noret_offset(ptr %out, i32 %in) {
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_xor v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB71_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_xor_b32_e32 v3, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB71_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_xor_i32_noret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB71_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB71_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst
@@ -2943,25 +3957,67 @@ define i32 @flat_atomic_xor_i32_ret(ptr %ptr, i32 %in) {
; GCN1-LABEL: flat_atomic_xor_i32_ret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB72_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: v_xor_b32_e32 v3, v4, v2
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB72_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v0, v3
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_xor_i32_ret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB72_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: v_xor_b32_e32 v3, v4, v2
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB72_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v0, v3
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_xor_i32_ret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB72_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB72_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw xor ptr %ptr, i32 %in seq_cst
ret i32 %result
@@ -2971,29 +4027,69 @@ define i32 @flat_atomic_xor_i32_ret_offset(ptr %out, i32 %in) {
; GCN1-LABEL: flat_atomic_xor_i32_ret_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v0, v[3:4]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB73_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v1, v0
+; GCN1-NEXT: v_xor_b32_e32 v0, v1, v2
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB73_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_xor_i32_ret_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v0, v[3:4]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB73_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v1, v0
+; GCN2-NEXT: v_xor_b32_e32 v0, v1, v2
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB73_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_xor_i32_ret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB73_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB73_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%result = atomicrmw xor ptr %gep, i32 %in seq_cst
@@ -3006,10 +4102,22 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inr
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_xor v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB74_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_xor_b32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB74_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_xor_i32_noret_scalar:
@@ -3017,10 +4125,22 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inr
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_xor v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB74_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_xor_b32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB74_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_xor_i32_noret_scalar:
@@ -3028,10 +4148,22 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inr
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_xor v[0:1], v2
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB74_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_xor_b32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB74_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst
ret void
@@ -3045,10 +4177,22 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s34
; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_xor v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB75_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_xor_b32_e32 v2, s6, v3
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB75_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_xor_i32_noret_offset_scalar:
@@ -3058,10 +4202,22 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s34
; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_xor v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB75_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_xor_b32_e32 v2, s6, v3
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB75_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_xor_i32_noret_offset_scalar:
@@ -3069,10 +4225,22 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB75_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_xor_b32_e32 v2, s6, v3
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB75_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst
@@ -3085,10 +4253,24 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v0, v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v1, s4
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v2, s5
+; GCN1-NEXT: .LBB76_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_xor_b32_e32 v3, s6, v4
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB76_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_xor_i32_ret_scalar:
@@ -3096,10 +4278,24 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v0, v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v1, s4
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v2, s5
+; GCN2-NEXT: .LBB76_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_xor_b32_e32 v3, s6, v4
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB76_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_xor_i32_ret_scalar:
@@ -3107,10 +4303,24 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v0, v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
+; GCN3-NEXT: .LBB76_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_xor_b32_e32 v3, s6, v4
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB76_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw xor ptr %ptr, i32 %in seq_cst
ret i32 %result
@@ -3122,12 +4332,24 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 16
; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
+; GCN1-NEXT: v_mov_b32_e32 v1, s34
+; GCN1-NEXT: v_mov_b32_e32 v2, s35
+; GCN1-NEXT: flat_load_dword v0, v[1:2]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB77_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_xor_b32_e32 v3, s6, v4
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB77_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_xor_i32_ret_offset_scalar:
@@ -3135,12 +4357,24 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 16
; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
+; GCN2-NEXT: v_mov_b32_e32 v1, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s35
+; GCN2-NEXT: flat_load_dword v0, v[1:2]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB77_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_xor_b32_e32 v3, s6, v4
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB77_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_xor_i32_ret_offset_scalar:
@@ -3148,10 +4382,24 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
+; GCN3-NEXT: .LBB77_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_xor_b32_e32 v3, s6, v4
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB77_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%result = atomicrmw xor ptr %gep, i32 %in seq_cst
@@ -4228,22 +5476,9 @@ define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_load_dword v4, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB92_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_max_i32_e32 v3, v4, v2
-; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GCN1-NEXT: flat_atomic_smax v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v4, v3
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB92_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory:
@@ -4251,43 +5486,17 @@ define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_load_dword v4, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB92_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_max_i32_e32 v3, v4, v2
-; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GCN2-NEXT: flat_atomic_smax v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v4, v3
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB92_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_max_i32_noret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB92_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_max_i32_e32 v3, v4, v2
-; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
+; GCN3-NEXT: flat_atomic_smax v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v4, v3
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB92_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i64 4
%tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -4298,69 +5507,29 @@ define i32 @flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3
; GCN1-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_load_dword v0, v[3:4]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB93_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_max_i32_e32 v0, v1, v2
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_smax v0, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB93_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_load_dword v0, v[3:4]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB93_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_max_i32_e32 v0, v1, v2
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_smax v0, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB93_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB93_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v4, v3
-; GCN3-NEXT: v_max_i32_e32 v3, v4, v2
-; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
+; GCN3-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB93_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i64 4
%result = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -5281,22 +6450,9 @@ define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_load_dword v4, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB105_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_max_u32_e32 v3, v4, v2
-; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GCN1-NEXT: flat_atomic_umax v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v4, v3
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB105_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory:
@@ -5304,43 +6460,17 @@ define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_load_dword v4, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB105_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_max_u32_e32 v3, v4, v2
-; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GCN2-NEXT: flat_atomic_umax v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v4, v3
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB105_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_umax_i32_noret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB105_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_max_u32_e32 v3, v4, v2
-; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
+; GCN3-NEXT: flat_atomic_umax v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v4, v3
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB105_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i64 4
%tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -5351,69 +6481,29 @@ define i32 @flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i
; GCN1-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_load_dword v0, v[3:4]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB106_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_max_u32_e32 v0, v1, v2
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_umax v0, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB106_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_load_dword v0, v[3:4]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB106_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_max_u32_e32 v0, v1, v2
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_umax v0, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB106_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB106_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v4, v3
-; GCN3-NEXT: v_max_u32_e32 v3, v4, v2
-; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
+; GCN3-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB106_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i64 4
%result = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -6022,22 +7112,9 @@ define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_load_dword v4, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB115_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_min_u32_e32 v3, v4, v2
-; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GCN1-NEXT: flat_atomic_umin v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v4, v3
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB115_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory:
@@ -6045,43 +7122,17 @@ define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_load_dword v4, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB115_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_min_u32_e32 v3, v4, v2
-; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GCN2-NEXT: flat_atomic_umin v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v4, v3
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB115_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_umin_i32_noret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB115_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_min_u32_e32 v3, v4, v2
-; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
+; GCN3-NEXT: flat_atomic_umin v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v4, v3
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB115_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i64 4
%tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -6092,69 +7143,29 @@ define i32 @flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i
; GCN1-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_load_dword v0, v[3:4]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB116_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_min_u32_e32 v0, v1, v2
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_umin v0, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB116_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_load_dword v0, v[3:4]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB116_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_min_u32_e32 v0, v1, v2
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_umin v0, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB116_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB116_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v4, v3
-; GCN3-NEXT: v_min_u32_e32 v3, v4, v2
-; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
+; GCN3-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB116_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i64 4
%result = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -7152,22 +8163,9 @@ define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_load_dword v4, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB129_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_min_i32_e32 v3, v4, v2
-; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GCN1-NEXT: flat_atomic_smin v[0:1], v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v4, v3
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB129_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory:
@@ -7175,43 +8173,17 @@ define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_load_dword v4, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB129_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_min_i32_e32 v3, v4, v2
-; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GCN2-NEXT: flat_atomic_smin v[0:1], v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v4, v3
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB129_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_min_i32_noret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB129_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_min_i32_e32 v3, v4, v2
-; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
+; GCN3-NEXT: flat_atomic_smin v[0:1], v2 offset:16
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v4, v3
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB129_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i64 4
%tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -7222,69 +8194,29 @@ define i32 @flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3
; GCN1-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_load_dword v0, v[3:4]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB130_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: v_min_i32_e32 v0, v1, v2
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_atomic_smin v0, v[0:1], v2 glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB130_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_load_dword v0, v[3:4]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB130_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: v_min_i32_e32 v0, v1, v2
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_atomic_smin v0, v[0:1], v2 glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB130_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB130_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v4, v3
-; GCN3-NEXT: v_min_i32_e32 v3, v4, v2
-; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
+; GCN3-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB130_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i64 4
%result = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -7299,25 +8231,70 @@ define void @flat_atomic_uinc_wrap_i32_noret(ptr %ptr, i32 %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_inc v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB131_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, 1, v4
+; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB131_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_inc v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB131_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, 1, v4
+; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB131_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_inc v[0:1], v2
+; GCN3-NEXT: flat_load_dword v4, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB131_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_add_u32_e32 v3, 1, v4
+; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB131_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
ret void
@@ -7329,9 +8306,24 @@ define void @flat_atomic_uinc_wrap_i32_noret_offset(ptr %out, i32 %in) {
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_inc v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB132_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, 1, v4
+; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB132_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_offset:
@@ -7339,17 +8331,47 @@ define void @flat_atomic_uinc_wrap_i32_noret_offset(ptr %out, i32 %in) {
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_inc v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB132_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, 1, v4
+; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB132_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB132_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_add_u32_e32 v3, 1, v4
+; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB132_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
@@ -7360,25 +8382,73 @@ define i32 @flat_atomic_uinc_wrap_i32_ret(ptr %ptr, i32 %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB133_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, 1, v4
+; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB133_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v0, v3
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB133_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, 1, v4
+; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB133_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v0, v3
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB133_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_add_u32_e32 v3, 1, v4
+; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB133_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
ret i32 %result
@@ -7388,29 +8458,75 @@ define i32 @flat_atomic_uinc_wrap_i32_ret_offset(ptr %out, i32 %in) {
; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v0, v[3:4]
+; GCN1-NEXT: s_mov_b64 s[4:5], 0
+; GCN1-NEXT: .LBB134_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v1, v0
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v1
+; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2
+; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execnz .LBB134_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v0, v[3:4]
+; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: .LBB134_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v1, v0
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v1
+; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2
+; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execnz .LBB134_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: .LBB134_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_add_u32_e32 v3, 1, v4
+; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_cbranch_execnz .LBB134_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
@@ -7423,10 +8539,24 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_inc v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB135_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v2, vcc, 1, v3
+; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3
+; GCN1-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB135_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar:
@@ -7434,10 +8564,24 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_inc v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB135_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v2, vcc, 1, v3
+; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3
+; GCN2-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB135_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar:
@@ -7445,10 +8589,24 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_inc v[0:1], v2
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB135_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_add_u32_e32 v2, 1, v3
+; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3
+; GCN3-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB135_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
ret void
@@ -7462,10 +8620,24 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s34
; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_inc v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB136_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v2, vcc, 1, v3
+; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3
+; GCN1-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB136_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar:
@@ -7475,10 +8647,24 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s34
; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_inc v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB136_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v2, vcc, 1, v3
+; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3
+; GCN2-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB136_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar:
@@ -7486,10 +8672,24 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB136_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_add_u32_e32 v2, 1, v3
+; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3
+; GCN3-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB136_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
@@ -7502,10 +8702,26 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v0, v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v1, s4
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v2, s5
+; GCN1-NEXT: .LBB137_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v4
+; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4
+; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB137_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar:
@@ -7513,10 +8729,26 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v0, v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v1, s4
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v2, s5
+; GCN2-NEXT: .LBB137_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v4
+; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4
+; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB137_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar:
@@ -7524,10 +8756,26 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v0, v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
+; GCN3-NEXT: .LBB137_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_add_u32_e32 v0, 1, v4
+; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4
+; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB137_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
ret i32 %result
@@ -7539,12 +8787,26 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 16
; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN1-NEXT: v_mov_b32_e32 v1, s34
+; GCN1-NEXT: v_mov_b32_e32 v2, s35
+; GCN1-NEXT: flat_load_dword v0, v[1:2]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB138_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v0
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v4
+; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4
+; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB138_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar:
@@ -7552,12 +8814,26 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 16
; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GCN2-NEXT: v_mov_b32_e32 v1, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s35
+; GCN2-NEXT: flat_load_dword v0, v[1:2]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB138_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v0
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v4
+; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4
+; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB138_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar:
@@ -7565,10 +8841,26 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
+; GCN3-NEXT: .LBB138_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v0
+; GCN3-NEXT: v_add_u32_e32 v0, 1, v4
+; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4
+; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB138_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
@@ -7649,25 +8941,76 @@ define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_dec v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB141_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB141_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_dec v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB141_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB141_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_dec v[0:1], v2
+; GCN3-NEXT: flat_load_dword v4, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB141_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GCN3-NEXT: v_add_u32_e32 v3, -1, v4
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB141_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
ret void
@@ -7679,9 +9022,26 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) {
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_dec v[0:1], v2
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB142_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB142_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_offset:
@@ -7689,17 +9049,51 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) {
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_dec v[0:1], v2
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB142_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB142_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB142_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GCN3-NEXT: v_add_u32_e32 v3, -1, v4
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB142_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
@@ -7710,25 +9104,79 @@ define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB143_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v4, v3
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB143_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v0, v3
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB143_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v4, v3
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB143_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v0, v3
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB143_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GCN3-NEXT: v_add_u32_e32 v3, -1, v4
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB143_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
ret i32 %result
@@ -7738,29 +9186,81 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) {
; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset:
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0
+; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v0, v[3:4]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB144_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v1, v0
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v1
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB144_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset:
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v0, v[3:4]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB144_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v1, v0
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v1
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB144_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB144_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v4, v3
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GCN3-NEXT: v_add_u32_e32 v3, -1, v4
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB144_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
@@ -7773,10 +9273,27 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_dec v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s6
+; GCN1-NEXT: .LBB145_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v2, vcc, -1, v3
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_execnz .LBB145_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_scalar:
@@ -7784,10 +9301,27 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_dec v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s6
+; GCN2-NEXT: .LBB145_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v2, vcc, -1, v3
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_execnz .LBB145_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_scalar:
@@ -7795,10 +9329,27 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_dec v[0:1], v2
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
+; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s6
+; GCN3-NEXT: .LBB145_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
+; GCN3-NEXT: v_add_u32_e32 v2, -1, v3
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_execnz .LBB145_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
ret void
@@ -7812,10 +9363,27 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s34
; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_dec v[0:1], v2
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s6
+; GCN1-NEXT: .LBB146_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v2, vcc, -1, v3
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_execnz .LBB146_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar:
@@ -7825,10 +9393,27 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg
; GCN2-NEXT: s_addc_u32 s35, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s34
; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_dec v[0:1], v2
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s6
+; GCN2-NEXT: .LBB146_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v2, vcc, -1, v3
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_execnz .LBB146_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar:
@@ -7836,10 +9421,27 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16
+; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16
+; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s6
+; GCN3-NEXT: .LBB146_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
+; GCN3-NEXT: v_add_u32_e32 v2, -1, v3
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_execnz .LBB146_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
@@ -7852,10 +9454,29 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN1-NEXT: flat_load_dword v0, v[0:1]
+; GCN1-NEXT: v_mov_b32_e32 v1, s4
+; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v3, s6
+; GCN1-NEXT: v_mov_b32_e32 v2, s5
+; GCN1-NEXT: .LBB147_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v5, v0
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v5
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN1-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_execnz .LBB147_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_scalar:
@@ -7863,10 +9484,29 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN2-NEXT: flat_load_dword v0, v[0:1]
+; GCN2-NEXT: v_mov_b32_e32 v1, s4
+; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v3, s6
+; GCN2-NEXT: v_mov_b32_e32 v2, s5
+; GCN2-NEXT: .LBB147_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v5, v0
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v5
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN2-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_execnz .LBB147_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_scalar:
@@ -7874,10 +9514,29 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN3-NEXT: flat_load_dword v0, v[0:1]
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
+; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v3, s6
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
+; GCN3-NEXT: .LBB147_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v5, v0
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
+; GCN3-NEXT: v_add_u32_e32 v0, -1, v5
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN3-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_execnz .LBB147_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
ret i32 %result
@@ -7889,12 +9548,29 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s34, s4, 16
; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN1-NEXT: v_mov_b32_e32 v1, s34
+; GCN1-NEXT: v_mov_b32_e32 v2, s35
+; GCN1-NEXT: flat_load_dword v0, v[1:2]
+; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v3, s6
+; GCN1-NEXT: .LBB148_1: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v5, v0
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v5
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN1-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
+; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_execnz .LBB148_1
+; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar:
@@ -7902,12 +9578,29 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s34, s4, 16
; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; GCN2-NEXT: v_mov_b32_e32 v1, s34
+; GCN2-NEXT: v_mov_b32_e32 v2, s35
+; GCN2-NEXT: flat_load_dword v0, v[1:2]
+; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v3, s6
+; GCN2-NEXT: .LBB148_1: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v5, v0
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v5
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN2-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
+; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_execnz .LBB148_1
+; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar:
@@ -7915,10 +9608,29 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc
+; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16
+; GCN3-NEXT: v_mov_b32_e32 v1, s4
+; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v3, s6
+; GCN3-NEXT: v_mov_b32_e32 v2, s5
+; GCN3-NEXT: .LBB148_1: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v5, v0
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
+; GCN3-NEXT: v_add_u32_e32 v0, -1, v5
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN3-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
+; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_execnz .LBB148_1
+; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
+; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr %out, i32 4
%result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index ffe0596..d9a5962 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -1299,7 +1299,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1442,7 +1442,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -1594,7 +1594,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1743,7 +1743,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -1875,7 +1875,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2012,7 +2012,7 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -2157,7 +2157,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2300,7 +2300,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -2441,7 +2441,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2587,7 +2587,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -2742,7 +2742,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2894,7 +2894,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -3029,7 +3029,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -3169,7 +3169,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -3317,7 +3317,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -3463,7 +3463,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -3604,7 +3604,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -3752,7 +3752,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -3907,7 +3907,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -4061,7 +4061,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -4196,7 +4196,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -4338,7 +4338,7 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -4486,7 +4486,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -4634,7 +4634,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -4775,7 +4775,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -4923,7 +4923,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -5078,7 +5078,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -5232,7 +5232,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -5367,7 +5367,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -5509,7 +5509,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -5657,7 +5657,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -5805,7 +5805,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -5946,7 +5946,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -6094,7 +6094,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -6249,7 +6249,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -6403,7 +6403,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -6538,7 +6538,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -6680,7 +6680,7 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -6828,7 +6828,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -6976,7 +6976,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -7117,7 +7117,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -7265,7 +7265,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -7420,7 +7420,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -7574,7 +7574,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -7709,7 +7709,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -7851,7 +7851,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -7999,7 +7999,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -8147,7 +8147,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -8285,7 +8285,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -8428,7 +8428,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -8580,7 +8580,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -8729,7 +8729,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -8861,7 +8861,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -8998,7 +8998,7 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -9143,7 +9143,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -9286,7 +9286,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -10759,7 +10759,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -10902,7 +10902,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -11054,7 +11054,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -11203,7 +11203,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -11335,7 +11335,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -11472,7 +11472,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -11617,7 +11617,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -11760,7 +11760,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -14107,7 +14107,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -14260,7 +14260,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -14422,7 +14422,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -14581,7 +14581,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -14723,7 +14723,7 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -14870,7 +14870,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -15025,7 +15025,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -15178,7 +15178,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -15335,7 +15335,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -15499,7 +15499,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr %out, i64 4
- %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -15670,7 +15670,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -15840,7 +15840,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
%gep = getelementptr i64, ptr %ptr, i64 4
- %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -15991,7 +15991,7 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -16149,7 +16149,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
@@ -16313,7 +16313,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -16477,7 +16477,9 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr %out, i64 %index
- %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store i64 %tmp0, ptr %out2
ret void
}
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll
index 3c1bc95..757649c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll
@@ -458,13 +458,25 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v1, s3, v3
+; GFX7-NEXT: v_and_b32_e32 v0, s2, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB8_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64_offset:
@@ -473,13 +485,25 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v1, s3, v3
+; GFX8-NEXT: v_and_b32_e32 v0, s2, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB8_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_offset:
@@ -501,40 +525,66 @@ entry:
define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_and_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_and_b32_e32 v3, s5, v5
+; GFX7-NEXT: v_and_b32_e32 v2, s4, v4
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB9_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_and_b32_e32 v3, s5, v5
+; GFX8-NEXT: v_and_b32_e32 v2, s4, v4
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB9_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_offset:
@@ -561,40 +611,64 @@ entry:
define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_and_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s4
+; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v1, s3, v3
+; GFX7-NEXT: v_and_b32_e32 v0, s2, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB10_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v1, s3, v3
+; GFX8-NEXT: v_and_b32_e32 v0, s2, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB10_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_addr64_offset:
@@ -624,42 +698,68 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_and_b32_e32 v3, s5, v5
+; GFX7-NEXT: v_and_b32_e32 v2, s4, v4
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB11_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_and_b32_e32 v3, s5, v5
+; GFX8-NEXT: v_and_b32_e32 v2, s4, v4
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB11_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_addr64_offset:
@@ -689,27 +789,55 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_and_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v1, s3, v3
+; GFX7-NEXT: v_and_b32_e32 v0, s2, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB12_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v1, s3, v3
+; GFX8-NEXT: v_and_b32_e32 v0, s2, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB12_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64:
@@ -732,14 +860,29 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_and_b32_e32 v5, s5, v7
+; GFX7-NEXT: v_and_b32_e32 v4, s4, v6
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB13_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -749,14 +892,29 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_and_b32_e32 v5, s5, v7
+; GFX8-NEXT: v_and_b32_e32 v4, s4, v6
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB13_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -785,36 +943,60 @@ entry:
define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_and_i64_addr64:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s4
+; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v1, s3, v3
+; GFX7-NEXT: v_and_b32_e32 v0, s2, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB14_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64_addr64:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v1, s3, v3
+; GFX8-NEXT: v_and_b32_e32 v0, s2, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB14_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_addr64:
@@ -843,38 +1025,64 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_and_b32_e32 v3, s5, v5
+; GFX7-NEXT: v_and_b32_e32 v2, s4, v4
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB15_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_and_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_and_b32_e32 v3, s5, v5
+; GFX8-NEXT: v_and_b32_e32 v2, s4, v4
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB15_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_addr64:
@@ -906,13 +1114,26 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2
+; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB16_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64_offset:
@@ -921,13 +1142,26 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB16_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_offset:
@@ -949,40 +1183,68 @@ entry:
define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_sub_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_mov_b32_e32 v7, v2
+; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7
+; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB17_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v8, v3
+; GFX8-NEXT: v_mov_b32_e32 v7, v2
+; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7
+; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB17_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_offset:
@@ -1009,40 +1271,66 @@ entry:
define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_sub_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s4
+; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2
+; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB18_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB18_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_addr64_offset:
@@ -1072,42 +1360,70 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_mov_b32_e32 v7, v2
+; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7
+; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB19_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v8, v3
+; GFX8-NEXT: v_mov_b32_e32 v7, v2
+; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7
+; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB19_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset:
@@ -1137,27 +1453,57 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_sub_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2
+; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB20_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB20_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64:
@@ -1180,14 +1526,30 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v8, v1
+; GFX7-NEXT: v_mov_b32_e32 v7, v0
+; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7
+; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB21_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -1197,14 +1559,30 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v8, v1
+; GFX8-NEXT: v_mov_b32_e32 v7, v0
+; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7
+; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB21_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -1233,36 +1611,62 @@ entry:
define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_sub_i64_addr64:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s4
+; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s2, v2
+; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB22_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64_addr64:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB22_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_addr64:
@@ -1291,38 +1695,66 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_mov_b32_e32 v7, v2
+; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s4, v7
+; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB23_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_sub_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v8, v3
+; GFX8-NEXT: v_mov_b32_e32 v7, v2
+; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s4, v7
+; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[5:8] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB23_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_addr64:
@@ -1354,12 +1786,27 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1]
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
+; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB24_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_offset:
@@ -1368,12 +1815,27 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB24_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_offset:
@@ -1395,40 +1857,70 @@ entry:
define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_max_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
+; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB25_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB25_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_offset:
@@ -1455,38 +1947,68 @@ entry:
define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_max_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s4
+; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1]
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
+; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB26_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB26_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_addr64_offset:
@@ -1516,42 +2038,72 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
+; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB27_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB27_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_addr64_offset:
@@ -1581,25 +2133,59 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_max_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB28_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB28_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64:
@@ -1622,16 +2208,33 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB29_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
@@ -1639,16 +2242,33 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB29_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -1675,34 +2295,64 @@ entry:
define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_max_i64_addr64:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s4
+; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
+; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB30_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_addr64:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB30_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_addr64:
@@ -1731,38 +2381,68 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
+; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB31_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB31_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_addr64:
@@ -1794,12 +2474,27 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1]
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
+; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB32_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_offset:
@@ -1808,12 +2503,27 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB32_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_offset:
@@ -1835,40 +2545,70 @@ entry:
define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_umax_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
+; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB33_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB33_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_offset:
@@ -1895,38 +2635,68 @@ entry:
define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umax_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s4
+; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1]
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
+; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB34_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB34_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_addr64_offset:
@@ -1956,42 +2726,72 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
+; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB35_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB35_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset:
@@ -2021,25 +2821,59 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_umax_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB36_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB36_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64:
@@ -2062,16 +2896,33 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB37_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
@@ -2079,16 +2930,33 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB37_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -2115,34 +2983,64 @@ entry:
define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umax_i64_addr64:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s4
+; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
+; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB38_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_addr64:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB38_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_addr64:
@@ -2171,38 +3069,68 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
+; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB39_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB39_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_addr64:
@@ -2234,12 +3162,27 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1]
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
+; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB40_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_offset:
@@ -2248,12 +3191,27 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB40_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_offset:
@@ -2275,40 +3233,70 @@ entry:
define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_min_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
+; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB41_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB41_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_offset:
@@ -2335,38 +3323,68 @@ entry:
define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_min_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s4
+; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1]
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
+; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB42_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB42_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_addr64_offset:
@@ -2396,42 +3414,72 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
+; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB43_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB43_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_addr64_offset:
@@ -2461,25 +3509,59 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_min_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB44_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB44_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64:
@@ -2502,16 +3584,33 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB45_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
@@ -2519,16 +3618,33 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB45_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -2555,34 +3671,64 @@ entry:
define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_min_i64_addr64:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s4
+; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
+; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB46_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_addr64:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB46_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_addr64:
@@ -2611,38 +3757,68 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
+; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB47_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB47_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_addr64:
@@ -2674,12 +3850,27 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1]
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
+; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB48_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umin_i64_offset:
@@ -2688,12 +3879,27 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB48_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_offset:
@@ -2715,40 +3921,70 @@ entry:
define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_umin_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
+; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB49_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umin_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB49_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_offset:
@@ -2775,38 +4011,68 @@ entry:
define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umin_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s4
+; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1]
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
+; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB50_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umin_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB50_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_addr64_offset:
@@ -2836,42 +4102,72 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
+; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB51_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umin_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB51_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset:
@@ -2901,25 +4197,59 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_umin_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB52_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umin_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB52_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64:
@@ -2942,16 +4272,33 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB53_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm
;
@@ -2959,16 +4306,33 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB53_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -2995,34 +4359,64 @@ entry:
define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umin_i64_addr64:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s4
+; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s3
+; GFX7-NEXT: v_mov_b32_e32 v7, s2
+; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB54_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umin_i64_addr64:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB54_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_addr64:
@@ -3051,38 +4445,68 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
+; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB55_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umin_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB55_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_addr64:
@@ -3114,13 +4538,25 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_or_b32_e32 v1, s3, v3
+; GFX7-NEXT: v_or_b32_e32 v0, s2, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB56_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_or_i64_offset:
@@ -3129,13 +4565,25 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v1, s3, v3
+; GFX8-NEXT: v_or_b32_e32 v0, s2, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB56_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_offset:
@@ -3157,40 +4605,66 @@ entry:
define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_or_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_or_b32_e32 v3, s5, v5
+; GFX7-NEXT: v_or_b32_e32 v2, s4, v4
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB57_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_or_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, s5, v5
+; GFX8-NEXT: v_or_b32_e32 v2, s4, v4
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB57_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_offset:
@@ -3217,40 +4691,64 @@ entry:
define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_or_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s4
+; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_or_b32_e32 v1, s3, v3
+; GFX7-NEXT: v_or_b32_e32 v0, s2, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB58_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_or_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v1, s3, v3
+; GFX8-NEXT: v_or_b32_e32 v0, s2, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB58_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_addr64_offset:
@@ -3280,42 +4778,68 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_or_b32_e32 v3, s5, v5
+; GFX7-NEXT: v_or_b32_e32 v2, s4, v4
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB59_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_or_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, s5, v5
+; GFX8-NEXT: v_or_b32_e32 v2, s4, v4
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB59_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64_offset:
@@ -3345,27 +4869,55 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_or_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_or_b32_e32 v1, s3, v3
+; GFX7-NEXT: v_or_b32_e32 v0, s2, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB60_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_or_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v1, s3, v3
+; GFX8-NEXT: v_or_b32_e32 v0, s2, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB60_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64:
@@ -3388,14 +4940,29 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_or_b32_e32 v5, s5, v7
+; GFX7-NEXT: v_or_b32_e32 v4, s4, v6
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB61_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -3405,14 +4972,29 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_or_b32_e32 v5, s5, v7
+; GFX8-NEXT: v_or_b32_e32 v4, s4, v6
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB61_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -3441,36 +5023,60 @@ entry:
define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_or_i64_addr64:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s4
+; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_or_b32_e32 v1, s3, v3
+; GFX7-NEXT: v_or_b32_e32 v0, s2, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB62_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_or_i64_addr64:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v1, s3, v3
+; GFX8-NEXT: v_or_b32_e32 v0, s2, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB62_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_addr64:
@@ -3499,38 +5105,64 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_or_b32_e32 v3, s5, v5
+; GFX7-NEXT: v_or_b32_e32 v2, s4, v4
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB63_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_or_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, s5, v5
+; GFX8-NEXT: v_or_b32_e32 v2, s4, v4
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB63_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64:
@@ -4104,13 +5736,25 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3
+; GFX7-NEXT: v_xor_b32_e32 v0, s2, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB74_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64_offset:
@@ -4119,13 +5763,25 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3
+; GFX8-NEXT: v_xor_b32_e32 v0, s2, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB74_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_offset:
@@ -4147,40 +5803,66 @@ entry:
define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_xor_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_xor_b32_e32 v3, s5, v5
+; GFX7-NEXT: v_xor_b32_e32 v2, s4, v4
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB75_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_xor_b32_e32 v3, s5, v5
+; GFX8-NEXT: v_xor_b32_e32 v2, s4, v4
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB75_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_offset:
@@ -4207,40 +5889,64 @@ entry:
define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_xor_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s4
+; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3
+; GFX7-NEXT: v_xor_b32_e32 v0, s2, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB76_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3
+; GFX8-NEXT: v_xor_b32_e32 v0, s2, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB76_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_addr64_offset:
@@ -4270,42 +5976,68 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_xor_b32_e32 v3, s5, v5
+; GFX7-NEXT: v_xor_b32_e32 v2, s4, v4
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB77_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_xor_b32_e32 v3, s5, v5
+; GFX8-NEXT: v_xor_b32_e32 v2, s4, v4
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB77_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset:
@@ -4335,27 +6067,55 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_xor_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3
+; GFX7-NEXT: v_xor_b32_e32 v0, s2, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB78_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3
+; GFX8-NEXT: v_xor_b32_e32 v0, s2, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB78_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64:
@@ -4378,14 +6138,29 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v3, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_xor_b32_e32 v5, s5, v7
+; GFX7-NEXT: v_xor_b32_e32 v4, s4, v6
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB79_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -4395,14 +6170,29 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_xor_b32_e32 v5, s5, v7
+; GFX8-NEXT: v_xor_b32_e32 v4, s4, v6
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB79_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -4431,36 +6221,60 @@ entry:
define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_xor_i64_addr64:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s4
+; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB80_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_xor_b32_e32 v1, s3, v3
+; GFX7-NEXT: v_xor_b32_e32 v0, s2, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB80_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64_addr64:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v1, s3, v3
+; GFX8-NEXT: v_xor_b32_e32 v0, s2, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB80_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_addr64:
@@ -4489,38 +6303,64 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB81_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_xor_b32_e32 v3, s5, v5
+; GFX7-NEXT: v_xor_b32_e32 v2, s4, v4
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB81_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_xor_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_xor_b32_e32 v3, s5, v5
+; GFX8-NEXT: v_xor_b32_e32 v2, s4, v4
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB81_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_addr64:
@@ -5920,13 +7760,28 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB107_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB107_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_offset:
@@ -5935,13 +7790,28 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB107_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB107_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_offset:
@@ -5963,40 +7833,72 @@ entry:
define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_inc_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB108_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4
+; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
+; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB108_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB108_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB108_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_ret_offset:
@@ -6023,40 +7925,70 @@ entry:
define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_inc_i64_incr64_offset:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s4
+; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB109_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB109_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_incr64_offset:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB109_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB109_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_incr64_offset:
@@ -6086,42 +8018,74 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB110_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4
+; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
+; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB110_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_ret_incr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB110_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB110_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset:
@@ -6151,27 +8115,61 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_inc_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: .LBB111_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB111_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: .LBB111_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB111_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64:
@@ -6194,34 +8192,66 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: .LBB112_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4
+; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
+; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB112_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_ret:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: .LBB112_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB112_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_ret:
@@ -6247,36 +8277,66 @@ entry:
define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_inc_i64_incr64:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s4
+; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB113_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB113_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_incr64:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s4
+; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB113_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB113_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_incr64:
@@ -6305,38 +8365,70 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX7-NEXT: s_add_u32 s0, s0, s6
+; GFX7-NEXT: s_addc_u32 s1, s1, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[0:1], 0
+; GFX7-NEXT: .LBB114_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v4
+; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
+; GFX7-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7-NEXT: s_cbranch_execnz .LBB114_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_inc_i64_ret_incr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX8-NEXT: s_add_u32 s0, s0, s6
+; GFX8-NEXT: s_addc_u32 s1, s1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[0:1], 0
+; GFX8-NEXT: .LBB114_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_cbranch_execnz .LBB114_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_ret_incr64:
@@ -6364,32 +8456,70 @@ entry:
define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_dec_i64_offset:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_add_u32 s0, s0, 32
-; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX7-NEXT: s_add_u32 s0, s4, 32
+; GFX7-NEXT: s_addc_u32 s1, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: .LBB115_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX7-NEXT: v_add_i32_e64 v0, s[2:3], -1, v2
+; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB115_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_offset:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_add_u32 s0, s0, 32
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: .LBB115_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX8-NEXT: v_add_u32_e64 v0, s[2:3], -1, v2
+; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB115_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_offset:
@@ -6411,40 +8541,80 @@ entry:
define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_dec_i64_ret_offset:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: s_add_u32 s0, s0, 32
-; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: s_add_u32 s0, s8, 32
+; GFX7-NEXT: s_addc_u32 s1, s9, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
+; GFX7-NEXT: .LBB116_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9]
+; GFX7-NEXT: v_add_i32_e64 v2, s[2:3], -1, v8
+; GFX7-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB116_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v0, s10
+; GFX7-NEXT: v_mov_b32_e32 v1, s11
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_ret_offset:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: s_add_u32 s0, s0, 32
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: s_add_u32 s0, s8, 32
+; GFX8-NEXT: s_addc_u32 s1, s9, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: .LBB116_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9]
+; GFX8-NEXT: v_add_u32_e64 v2, s[2:3], -1, v8
+; GFX8-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB116_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_ret_offset:
@@ -6471,40 +8641,78 @@ entry:
define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_dec_i64_decr64_offset:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX7-NEXT: s_add_u32 s0, s4, s0
+; GFX7-NEXT: s_addc_u32 s1, s5, s1
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: .LBB117_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX7-NEXT: v_add_i32_e64 v0, s[2:3], -1, v2
+; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB117_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_decr64_offset:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX8-NEXT: s_add_u32 s0, s4, s0
+; GFX8-NEXT: s_addc_u32 s1, s5, s1
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: .LBB117_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX8-NEXT: v_add_u32_e64 v0, s[2:3], -1, v2
+; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB117_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_decr64_offset:
@@ -6532,44 +8740,84 @@ entry:
define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_dec_i64_ret_decr64_offset:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX7-NEXT: s_add_u32 s0, s4, s0
+; GFX7-NEXT: s_addc_u32 s1, s5, s1
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s9
+; GFX7-NEXT: v_mov_b32_e32 v5, s8
+; GFX7-NEXT: .LBB118_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[8:9]
+; GFX7-NEXT: v_add_i32_e64 v2, s[2:3], -1, v8
+; GFX7-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB118_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_ret_decr64_offset:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX8-NEXT: s_add_u32 s0, s4, s0
+; GFX8-NEXT: s_addc_u32 s1, s5, s1
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s9
+; GFX8-NEXT: v_mov_b32_e32 v5, s8
+; GFX8-NEXT: .LBB118_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[8:9]
+; GFX8-NEXT: v_add_u32_e64 v2, s[2:3], -1, v8
+; GFX8-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB118_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset:
@@ -6598,28 +8846,70 @@ entry:
define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_dec_i64:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: v_mov_b32_e32 v5, s5
+; GFX7-NEXT: .LBB119_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX7-NEXT: v_add_i32_e64 v0, s[2:3], -1, v2
+; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB119_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
+; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: .LBB119_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX8-NEXT: v_add_u32_e64 v0, s[2:3], -1, v2
+; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_cbranch_execnz .LBB119_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64:
@@ -6640,36 +8930,76 @@ entry:
define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX7-LABEL: atomic_dec_i64_ret:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-NEXT: v_mov_b32_e32 v1, s9
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v4, s5
+; GFX7-NEXT: v_mov_b32_e32 v5, s4
+; GFX7-NEXT: .LBB120_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9]
+; GFX7-NEXT: v_add_i32_e64 v2, s[2:3], -1, v8
+; GFX7-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB120_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v0, s10
+; GFX7-NEXT: v_mov_b32_e32 v1, s11
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_ret:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NEXT: .LBB120_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], v[8:9]
+; GFX8-NEXT: v_add_u32_e64 v2, s[2:3], -1, v8
+; GFX8-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB120_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_ret:
@@ -6695,36 +9025,74 @@ entry:
define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_dec_i64_decr64:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: s_addc_u32 s1, s1, s3
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX7-NEXT: s_add_u32 s0, s4, s0
+; GFX7-NEXT: s_addc_u32 s1, s5, s1
+; GFX7-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: .LBB121_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX7-NEXT: v_add_i32_e64 v0, s[2:3], -1, v2
+; GFX7-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB121_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_decr64:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX8-NEXT: s_add_u32 s0, s4, s0
+; GFX8-NEXT: s_addc_u32 s1, s5, s1
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: .LBB121_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX8-NEXT: v_add_u32_e64 v0, s[2:3], -1, v2
+; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v3, s[2:3]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB121_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_decr64:
@@ -6751,40 +9119,80 @@ entry:
define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_dec_i64_ret_decr64:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; GFX7-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s4
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX7-NEXT: s_add_u32 s0, s0, s4
-; GFX7-NEXT: s_addc_u32 s1, s1, s5
-; GFX7-NEXT: v_mov_b32_e32 v3, s1
-; GFX7-NEXT: v_mov_b32_e32 v2, s0
-; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX7-NEXT: s_add_u32 s0, s4, s0
+; GFX7-NEXT: s_addc_u32 s1, s5, s1
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s9
+; GFX7-NEXT: v_mov_b32_e32 v5, s8
+; GFX7-NEXT: .LBB122_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v2
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[8:9]
+; GFX7-NEXT: v_add_i32_e64 v2, s[2:3], -1, v8
+; GFX7-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB122_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_dec_i64_ret_decr64:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX8-NEXT: s_add_u32 s0, s4, s0
+; GFX8-NEXT: s_addc_u32 s1, s5, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s9
+; GFX8-NEXT: v_mov_b32_e32 v5, s8
+; GFX8-NEXT: .LBB122_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v3
+; GFX8-NEXT: v_mov_b32_e32 v8, v2
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[0:1], s[8:9], v[8:9]
+; GFX8-NEXT: v_add_u32_e64 v2, s[2:3], -1, v8
+; GFX8-NEXT: v_addc_u32_e64 v3, s[2:3], -1, v9, s[2:3]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v5, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB122_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_ret_decr64:
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
index 23dfe2f..524100c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
@@ -3633,21 +3633,40 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) {
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB30_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: ; %bb.1: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB30_4
+; GCN1-NEXT: s_cbranch_execnz .LBB30_6
; GCN1-NEXT: .LBB30_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB30_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v7, v[4:5]
+; GCN1-NEXT: flat_load_dword v6, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB30_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v6, v2
+; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB30_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr2
+; GCN1-NEXT: ; implicit-def: $vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB30_2
-; GCN1-NEXT: .LBB30_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB30_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
@@ -3673,21 +3692,40 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) {
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB30_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: ; %bb.1: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB30_4
+; GCN2-NEXT: s_cbranch_execnz .LBB30_6
; GCN2-NEXT: .LBB30_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB30_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v7, v[4:5]
+; GCN2-NEXT: flat_load_dword v6, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB30_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v6, v2
+; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB30_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr2
+; GCN2-NEXT: ; implicit-def: $vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB30_2
-; GCN2-NEXT: .LBB30_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB30_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
@@ -3711,21 +3749,37 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB30_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB30_4
+; GCN3-NEXT: s_cbranch_execnz .LBB30_6
; GCN3-NEXT: .LBB30_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB30_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB30_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
+; GCN3-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB30_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: ; implicit-def: $vgpr2
+; GCN3-NEXT: ; implicit-def: $vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB30_2
-; GCN3-NEXT: .LBB30_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB30_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
@@ -3756,21 +3810,40 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB31_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: ; %bb.1: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB31_4
+; GCN1-NEXT: s_cbranch_execnz .LBB31_6
; GCN1-NEXT: .LBB31_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB31_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v7, v[4:5]
+; GCN1-NEXT: flat_load_dword v6, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB31_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v6, v2
+; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB31_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr2
+; GCN1-NEXT: ; implicit-def: $vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB31_2
-; GCN1-NEXT: .LBB31_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB31_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
@@ -3798,21 +3871,40 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB31_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: ; %bb.1: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB31_4
+; GCN2-NEXT: s_cbranch_execnz .LBB31_6
; GCN2-NEXT: .LBB31_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB31_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v7, v[4:5]
+; GCN2-NEXT: flat_load_dword v6, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB31_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v6, v2
+; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB31_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr2
+; GCN2-NEXT: ; implicit-def: $vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB31_2
-; GCN2-NEXT: .LBB31_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB31_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
@@ -3838,21 +3930,37 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB31_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB31_4
+; GCN3-NEXT: s_cbranch_execnz .LBB31_6
; GCN3-NEXT: .LBB31_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB31_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB31_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
+; GCN3-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB31_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: ; implicit-def: $vgpr2
+; GCN3-NEXT: ; implicit-def: $vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB31_2
-; GCN3-NEXT: .LBB31_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB31_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
@@ -3877,41 +3985,56 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) {
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN1-NEXT: v_mov_b32_e32 v5, v1
-; GCN1-NEXT: v_mov_b32_e32 v4, v0
-; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB32_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB32_4
-; GCN1-NEXT: .LBB32_2: ; %atomicrmw.phi
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-; GCN1-NEXT: .LBB32_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN1-NEXT: s_cbranch_execz .LBB32_4
+; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v5, v[4:5]
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB32_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v6, v2
+; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB32_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr2
+; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: .LBB32_4: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execz .LBB32_2
-; GCN1-NEXT: .LBB32_4: ; %atomicrmw.private
-; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
-; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN1-NEXT: s_cbranch_execz .LBB32_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
-; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v0, v2
+; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v4, v2
; GCN1-NEXT: s_waitcnt vmcnt(0)
-; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
-; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
-; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
+; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT: .LBB32_6: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v0, v4
+; GCN1-NEXT: v_mov_b32_e32 v1, v5
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
@@ -3920,41 +4043,56 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) {
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN2-NEXT: v_mov_b32_e32 v5, v1
-; GCN2-NEXT: v_mov_b32_e32 v4, v0
-; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB32_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB32_4
-; GCN2-NEXT: .LBB32_2: ; %atomicrmw.phi
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-; GCN2-NEXT: .LBB32_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN2-NEXT: s_cbranch_execz .LBB32_4
+; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v5, v[4:5]
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB32_2: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v6, v2
+; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB32_2
+; GCN2-NEXT: ; %bb.3: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr2
+; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: .LBB32_4: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execz .LBB32_2
-; GCN2-NEXT: .LBB32_4: ; %atomicrmw.private
-; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
-; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN2-NEXT: s_cbranch_execz .LBB32_6
+; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
+; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
-; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v0, v2
+; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v4, v2
; GCN2-NEXT: s_waitcnt vmcnt(0)
-; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
-; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
-; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
+; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT: .LBB32_6: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v0, v4
+; GCN2-NEXT: v_mov_b32_e32 v1, v5
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
@@ -3969,21 +4107,37 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB32_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB32_4
+; GCN3-NEXT: s_cbranch_execnz .LBB32_6
; GCN3-NEXT: .LBB32_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB32_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB32_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_sub_co_u32_e32 v6, vcc, v8, v2
+; GCN3-NEXT: v_subb_co_u32_e32 v7, vcc, v9, v3, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB32_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr2
+; GCN3-NEXT: ; implicit-def: $vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB32_2
-; GCN3-NEXT: .LBB32_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB32_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -4015,21 +4169,40 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB33_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: ; %bb.1: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB33_4
+; GCN1-NEXT: s_cbranch_execnz .LBB33_6
; GCN1-NEXT: .LBB33_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB33_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB33_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_sub_i32_e32 v6, vcc, v8, v2
+; GCN1-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB33_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN1-NEXT: ; implicit-def: $vgpr2
+; GCN1-NEXT: ; implicit-def: $vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB33_2
-; GCN1-NEXT: .LBB33_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB33_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -4058,21 +4231,40 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB33_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: ; %bb.1: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB33_4
+; GCN2-NEXT: s_cbranch_execnz .LBB33_6
; GCN2-NEXT: .LBB33_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB33_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB33_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_sub_u32_e32 v6, vcc, v8, v2
+; GCN2-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB33_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN2-NEXT: ; implicit-def: $vgpr2
+; GCN2-NEXT: ; implicit-def: $vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB33_2
-; GCN2-NEXT: .LBB33_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB33_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -4099,21 +4291,37 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB33_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB33_4
+; GCN3-NEXT: s_cbranch_execnz .LBB33_6
; GCN3-NEXT: .LBB33_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB33_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB33_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_sub_co_u32_e32 v6, vcc, v8, v2
+; GCN3-NEXT: v_subb_co_u32_e32 v7, vcc, v9, v3, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB33_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr2
+; GCN3-NEXT: ; implicit-def: $vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB33_2
-; GCN3-NEXT: .LBB33_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB33_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -4144,21 +4352,40 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN1-NEXT: s_mov_b64 s[34:35], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB34_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_vccz .LBB34_4
+; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_vccnz .LBB34_6
; GCN1-NEXT: .LBB34_2: ; %atomicrmw.phi
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB34_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_add_u32 s34, s4, 4
+; GCN1-NEXT: s_addc_u32 s35, s5, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v4, s4
+; GCN1-NEXT: v_mov_b32_e32 v5, s5
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: flat_load_dword v2, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s7
+; GCN1-NEXT: .LBB34_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2
+; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execnz .LBB34_2
-; GCN1-NEXT: .LBB34_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_mov_b32_e32 v3, v1
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v2, v0
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB34_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_branch .LBB34_2
+; GCN1-NEXT: .LBB34_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
@@ -4188,21 +4415,40 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN2-NEXT: s_mov_b64 s[34:35], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB34_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_vccz .LBB34_4
+; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_vccnz .LBB34_6
; GCN2-NEXT: .LBB34_2: ; %atomicrmw.phi
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB34_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_add_u32 s34, s4, 4
+; GCN2-NEXT: s_addc_u32 s35, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v4, s4
+; GCN2-NEXT: v_mov_b32_e32 v5, s5
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: flat_load_dword v2, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: .LBB34_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2
+; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execnz .LBB34_2
-; GCN2-NEXT: .LBB34_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_mov_b32_e32 v3, v1
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v2, v0
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB34_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_branch .LBB34_2
+; GCN2-NEXT: .LBB34_6: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s34, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s34
@@ -4229,21 +4475,35 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN3-NEXT: s_mov_b64 s[34:35], -1
; GCN3-NEXT: s_cbranch_vccnz .LBB34_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
-; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_vccz .LBB34_4
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_vccnz .LBB34_6
; GCN3-NEXT: .LBB34_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB34_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v4, s4
+; GCN3-NEXT: v_mov_b32_e32 v5, s5
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: .LBB34_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2
+; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v6, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execnz .LBB34_2
-; GCN3-NEXT: .LBB34_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v3, v1
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v2, v0
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB34_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_branch .LBB34_2
+; GCN3-NEXT: .LBB34_6: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN3-NEXT: s_cselect_b32 s34, s4, -1
; GCN3-NEXT: v_mov_b32_e32 v0, s34
@@ -4276,21 +4536,40 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN1-NEXT: s_mov_b64 s[36:37], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB35_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT: s_cbranch_vccz .LBB35_4
+; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_vccnz .LBB35_6
; GCN1-NEXT: .LBB35_2: ; %atomicrmw.phi
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB35_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_add_u32 s36, s34, 4
+; GCN1-NEXT: s_addc_u32 s37, s35, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s36
+; GCN1-NEXT: v_mov_b32_e32 v1, s37
+; GCN1-NEXT: v_mov_b32_e32 v4, s34
+; GCN1-NEXT: v_mov_b32_e32 v5, s35
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: flat_load_dword v2, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s7
+; GCN1-NEXT: .LBB35_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2
+; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execnz .LBB35_2
-; GCN1-NEXT: .LBB35_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_mov_b32_e32 v3, v1
+; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN1-NEXT: v_mov_b32_e32 v2, v0
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_execnz .LBB35_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_branch .LBB35_2
+; GCN1-NEXT: .LBB35_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
@@ -4322,21 +4601,40 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN2-NEXT: s_mov_b64 s[36:37], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB35_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT: s_cbranch_vccz .LBB35_4
+; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_vccnz .LBB35_6
; GCN2-NEXT: .LBB35_2: ; %atomicrmw.phi
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB35_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_add_u32 s36, s34, 4
+; GCN2-NEXT: s_addc_u32 s37, s35, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s36
+; GCN2-NEXT: v_mov_b32_e32 v1, s37
+; GCN2-NEXT: v_mov_b32_e32 v4, s34
+; GCN2-NEXT: v_mov_b32_e32 v5, s35
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: flat_load_dword v2, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: .LBB35_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2
+; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execnz .LBB35_2
-; GCN2-NEXT: .LBB35_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_mov_b32_e32 v3, v1
+; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN2-NEXT: v_mov_b32_e32 v2, v0
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_execnz .LBB35_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_branch .LBB35_2
+; GCN2-NEXT: .LBB35_6: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN2-NEXT: s_cselect_b32 s34, s34, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s34
@@ -4365,21 +4663,35 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN3-NEXT: s_mov_b64 s[36:37], -1
; GCN3-NEXT: s_cbranch_vccnz .LBB35_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
-; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT: s_cbranch_vccz .LBB35_4
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_vccnz .LBB35_6
; GCN3-NEXT: .LBB35_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB35_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v4, s34
+; GCN3-NEXT: v_mov_b32_e32 v5, s35
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: .LBB35_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2
+; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v6, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execnz .LBB35_2
-; GCN3-NEXT: .LBB35_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v3, v1
+; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN3-NEXT: v_mov_b32_e32 v2, v0
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_execnz .LBB35_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_branch .LBB35_2
+; GCN3-NEXT: .LBB35_6: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN3-NEXT: s_cselect_b32 s34, s34, -1
; GCN3-NEXT: v_mov_b32_e32 v0, s34
@@ -4409,20 +4721,39 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN1-NEXT: s_cmp_eq_u32 s5, s34
; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_vccz .LBB36_2
+; GCN1-NEXT: s_cbranch_vccz .LBB36_4
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_add_u32 s34, s4, 4
+; GCN1-NEXT: s_addc_u32 s35, s5, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
+; GCN1-NEXT: v_mov_b32_e32 v3, s5
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[2:3]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: .LBB36_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v8, v1
+; GCN1-NEXT: v_mov_b32_e32 v7, v0
+; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7
+; GCN1-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execz .LBB36_3
-; GCN1-NEXT: s_branch .LBB36_4
-; GCN1-NEXT: .LBB36_2:
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB36_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_branch .LBB36_6
+; GCN1-NEXT: .LBB36_4:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT: .LBB36_3: ; %atomicrmw.private
+; GCN1-NEXT: s_cbranch_execz .LBB36_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
@@ -4438,7 +4769,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
-; GCN1-NEXT: .LBB36_4: ; %atomicrmw.end
+; GCN1-NEXT: .LBB36_6: ; %atomicrmw.phi
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
@@ -4451,20 +4782,39 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN2-NEXT: s_cmp_eq_u32 s5, s34
; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_vccz .LBB36_2
+; GCN2-NEXT: s_cbranch_vccz .LBB36_4
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_add_u32 s34, s4, 4
+; GCN2-NEXT: s_addc_u32 s35, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
+; GCN2-NEXT: v_mov_b32_e32 v3, s5
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[2:3]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: .LBB36_2: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v8, v1
+; GCN2-NEXT: v_mov_b32_e32 v7, v0
+; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7
+; GCN2-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execz .LBB36_3
-; GCN2-NEXT: s_branch .LBB36_4
-; GCN2-NEXT: .LBB36_2:
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB36_2
+; GCN2-NEXT: ; %bb.3: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_branch .LBB36_6
+; GCN2-NEXT: .LBB36_4:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT: .LBB36_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cbranch_execz .LBB36_6
+; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s34, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s34
@@ -4479,7 +4829,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
-; GCN2-NEXT: .LBB36_4: ; %atomicrmw.end
+; GCN2-NEXT: .LBB36_6: ; %atomicrmw.phi
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
@@ -4490,20 +4840,34 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN3-NEXT: s_cmp_eq_u32 s5, s35
; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_vccz .LBB36_2
+; GCN3-NEXT: s_cbranch_vccz .LBB36_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v2, s4
+; GCN3-NEXT: v_mov_b32_e32 v3, s5
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s7
+; GCN3-NEXT: .LBB36_2: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v8, v1
+; GCN3-NEXT: v_mov_b32_e32 v7, v0
+; GCN3-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7
+; GCN3-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execz .LBB36_3
-; GCN3-NEXT: s_branch .LBB36_4
-; GCN3-NEXT: .LBB36_2:
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB36_2
+; GCN3-NEXT: ; %bb.3: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_branch .LBB36_6
+; GCN3-NEXT: .LBB36_4:
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT: .LBB36_3: ; %atomicrmw.private
+; GCN3-NEXT: s_cbranch_execz .LBB36_6
+; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN3-NEXT: s_cselect_b32 s34, s4, -1
; GCN3-NEXT: v_mov_b32_e32 v2, s34
@@ -4516,7 +4880,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT: .LBB36_4: ; %atomicrmw.end
+; GCN3-NEXT: .LBB36_6: ; %atomicrmw.phi
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw sub ptr %ptr, i64 %in seq_cst
@@ -4535,20 +4899,39 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN1-NEXT: s_cmp_eq_u32 s35, s36
; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT: s_cbranch_vccz .LBB37_2
+; GCN1-NEXT: s_cbranch_vccz .LBB37_4
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_add_u32 s36, s34, 4
+; GCN1-NEXT: s_addc_u32 s37, s35, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s36
+; GCN1-NEXT: v_mov_b32_e32 v1, s37
+; GCN1-NEXT: v_mov_b32_e32 v2, s34
+; GCN1-NEXT: v_mov_b32_e32 v3, s35
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[2:3]
+; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: .LBB37_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v8, v1
+; GCN1-NEXT: v_mov_b32_e32 v7, v0
+; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7
+; GCN1-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execz .LBB37_3
-; GCN1-NEXT: s_branch .LBB37_4
-; GCN1-NEXT: .LBB37_2:
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_execnz .LBB37_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_branch .LBB37_6
+; GCN1-NEXT: .LBB37_4:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT: .LBB37_3: ; %atomicrmw.private
+; GCN1-NEXT: s_cbranch_execz .LBB37_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
; GCN1-NEXT: v_mov_b32_e32 v4, s7
; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
@@ -4564,7 +4947,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
-; GCN1-NEXT: .LBB37_4: ; %atomicrmw.end
+; GCN1-NEXT: .LBB37_6: ; %atomicrmw.phi
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
@@ -4579,20 +4962,39 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN2-NEXT: s_cmp_eq_u32 s35, s36
; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT: s_cbranch_vccz .LBB37_2
+; GCN2-NEXT: s_cbranch_vccz .LBB37_4
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_add_u32 s36, s34, 4
+; GCN2-NEXT: s_addc_u32 s37, s35, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s36
+; GCN2-NEXT: v_mov_b32_e32 v1, s37
+; GCN2-NEXT: v_mov_b32_e32 v2, s34
+; GCN2-NEXT: v_mov_b32_e32 v3, s35
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[2:3]
+; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: .LBB37_2: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v8, v1
+; GCN2-NEXT: v_mov_b32_e32 v7, v0
+; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7
+; GCN2-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execz .LBB37_3
-; GCN2-NEXT: s_branch .LBB37_4
-; GCN2-NEXT: .LBB37_2:
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_execnz .LBB37_2
+; GCN2-NEXT: ; %bb.3: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_branch .LBB37_6
+; GCN2-NEXT: .LBB37_4:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT: .LBB37_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cbranch_execz .LBB37_6
+; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN2-NEXT: s_cselect_b32 s34, s34, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s34
@@ -4607,7 +5009,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
-; GCN2-NEXT: .LBB37_4: ; %atomicrmw.end
+; GCN2-NEXT: .LBB37_6: ; %atomicrmw.phi
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
@@ -4620,20 +5022,34 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN3-NEXT: s_cmp_eq_u32 s35, s37
; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT: s_cbranch_vccz .LBB37_2
+; GCN3-NEXT: s_cbranch_vccz .LBB37_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v2, s34
+; GCN3-NEXT: v_mov_b32_e32 v3, s35
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s7
+; GCN3-NEXT: .LBB37_2: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v8, v1
+; GCN3-NEXT: v_mov_b32_e32 v7, v0
+; GCN3-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7
+; GCN3-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execz .LBB37_3
-; GCN3-NEXT: s_branch .LBB37_4
-; GCN3-NEXT: .LBB37_2:
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_execnz .LBB37_2
+; GCN3-NEXT: ; %bb.3: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_branch .LBB37_6
+; GCN3-NEXT: .LBB37_4:
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT: .LBB37_3: ; %atomicrmw.private
+; GCN3-NEXT: s_cbranch_execz .LBB37_6
+; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN3-NEXT: s_cselect_b32 s34, s34, -1
; GCN3-NEXT: v_mov_b32_e32 v2, s34
@@ -4646,7 +5062,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT: .LBB37_4: ; %atomicrmw.end
+; GCN3-NEXT: .LBB37_6: ; %atomicrmw.phi
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
@@ -4928,21 +5344,40 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) {
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB40_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: ; %bb.1: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB40_4
+; GCN1-NEXT: s_cbranch_execnz .LBB40_6
; GCN1-NEXT: .LBB40_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB40_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v7, v[4:5]
+; GCN1-NEXT: flat_load_dword v6, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB40_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v5, v7, v3
+; GCN1-NEXT: v_and_b32_e32 v4, v6, v2
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB40_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: ; implicit-def: $vgpr2
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB40_2
-; GCN1-NEXT: .LBB40_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB40_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
@@ -4968,21 +5403,40 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) {
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB40_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: ; %bb.1: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB40_4
+; GCN2-NEXT: s_cbranch_execnz .LBB40_6
; GCN2-NEXT: .LBB40_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB40_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v7, v[4:5]
+; GCN2-NEXT: flat_load_dword v6, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB40_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v5, v7, v3
+; GCN2-NEXT: v_and_b32_e32 v4, v6, v2
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB40_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: ; implicit-def: $vgpr2
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB40_2
-; GCN2-NEXT: .LBB40_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB40_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
@@ -5006,21 +5460,37 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB40_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB40_4
+; GCN3-NEXT: s_cbranch_execnz .LBB40_6
; GCN3-NEXT: .LBB40_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB40_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB40_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_and_b32_e32 v5, v7, v3
+; GCN3-NEXT: v_and_b32_e32 v4, v6, v2
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB40_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: ; implicit-def: $vgpr2
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB40_2
-; GCN3-NEXT: .LBB40_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB40_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
@@ -5051,21 +5521,40 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB41_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: ; %bb.1: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB41_4
+; GCN1-NEXT: s_cbranch_execnz .LBB41_6
; GCN1-NEXT: .LBB41_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB41_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v7, v[4:5]
+; GCN1-NEXT: flat_load_dword v6, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB41_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v5, v7, v3
+; GCN1-NEXT: v_and_b32_e32 v4, v6, v2
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB41_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: ; implicit-def: $vgpr2
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB41_2
-; GCN1-NEXT: .LBB41_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB41_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
@@ -5093,21 +5582,40 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB41_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: ; %bb.1: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB41_4
+; GCN2-NEXT: s_cbranch_execnz .LBB41_6
; GCN2-NEXT: .LBB41_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB41_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v7, v[4:5]
+; GCN2-NEXT: flat_load_dword v6, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB41_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v5, v7, v3
+; GCN2-NEXT: v_and_b32_e32 v4, v6, v2
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB41_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: ; implicit-def: $vgpr2
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB41_2
-; GCN2-NEXT: .LBB41_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB41_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
@@ -5133,21 +5641,37 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB41_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB41_4
+; GCN3-NEXT: s_cbranch_execnz .LBB41_6
; GCN3-NEXT: .LBB41_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB41_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB41_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_and_b32_e32 v5, v7, v3
+; GCN3-NEXT: v_and_b32_e32 v4, v6, v2
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB41_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: ; implicit-def: $vgpr2
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB41_2
-; GCN3-NEXT: .LBB41_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB41_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
@@ -5172,41 +5696,56 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) {
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN1-NEXT: v_mov_b32_e32 v5, v1
-; GCN1-NEXT: v_mov_b32_e32 v4, v0
-; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB42_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB42_4
-; GCN1-NEXT: .LBB42_2: ; %atomicrmw.phi
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-; GCN1-NEXT: .LBB42_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN1-NEXT: s_cbranch_execz .LBB42_4
+; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v5, v[4:5]
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB42_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: v_and_b32_e32 v5, v7, v3
+; GCN1-NEXT: v_and_b32_e32 v4, v6, v2
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB42_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: ; implicit-def: $vgpr2
+; GCN1-NEXT: .LBB42_4: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execz .LBB42_2
-; GCN1-NEXT: .LBB42_4: ; %atomicrmw.private
-; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
-; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN1-NEXT: s_cbranch_execz .LBB42_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
-; GCN1-NEXT: v_and_b32_e32 v2, v0, v2
+; GCN1-NEXT: v_and_b32_e32 v2, v4, v2
; GCN1-NEXT: s_waitcnt vmcnt(0)
-; GCN1-NEXT: v_and_b32_e32 v3, v1, v3
-; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
-; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN1-NEXT: v_and_b32_e32 v3, v5, v3
+; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT: .LBB42_6: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v0, v4
+; GCN1-NEXT: v_mov_b32_e32 v1, v5
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
@@ -5215,41 +5754,56 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) {
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN2-NEXT: v_mov_b32_e32 v5, v1
-; GCN2-NEXT: v_mov_b32_e32 v4, v0
-; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB42_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB42_4
-; GCN2-NEXT: .LBB42_2: ; %atomicrmw.phi
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-; GCN2-NEXT: .LBB42_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN2-NEXT: s_cbranch_execz .LBB42_4
+; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v5, v[4:5]
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB42_2: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: v_and_b32_e32 v5, v7, v3
+; GCN2-NEXT: v_and_b32_e32 v4, v6, v2
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB42_2
+; GCN2-NEXT: ; %bb.3: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: ; implicit-def: $vgpr2
+; GCN2-NEXT: .LBB42_4: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execz .LBB42_2
-; GCN2-NEXT: .LBB42_4: ; %atomicrmw.private
-; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
-; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN2-NEXT: s_cbranch_execz .LBB42_6
+; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
+; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
-; GCN2-NEXT: v_and_b32_e32 v2, v0, v2
+; GCN2-NEXT: v_and_b32_e32 v2, v4, v2
; GCN2-NEXT: s_waitcnt vmcnt(0)
-; GCN2-NEXT: v_and_b32_e32 v3, v1, v3
-; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
-; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN2-NEXT: v_and_b32_e32 v3, v5, v3
+; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT: .LBB42_6: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v0, v4
+; GCN2-NEXT: v_mov_b32_e32 v1, v5
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
@@ -5264,21 +5818,37 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB42_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB42_4
+; GCN3-NEXT: s_cbranch_execnz .LBB42_6
; GCN3-NEXT: .LBB42_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB42_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB42_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_and_b32_e32 v7, v9, v3
+; GCN3-NEXT: v_and_b32_e32 v6, v8, v2
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB42_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: ; implicit-def: $vgpr2
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB42_2
-; GCN3-NEXT: .LBB42_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB42_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
@@ -5310,21 +5880,40 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB43_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: ; %bb.1: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB43_4
+; GCN1-NEXT: s_cbranch_execnz .LBB43_6
; GCN1-NEXT: .LBB43_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB43_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB43_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_and_b32_e32 v7, v9, v3
+; GCN1-NEXT: v_and_b32_e32 v6, v8, v2
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB43_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: ; implicit-def: $vgpr2
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB43_2
-; GCN1-NEXT: .LBB43_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB43_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
@@ -5353,21 +5942,40 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB43_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: ; %bb.1: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB43_4
+; GCN2-NEXT: s_cbranch_execnz .LBB43_6
; GCN2-NEXT: .LBB43_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB43_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB43_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_and_b32_e32 v7, v9, v3
+; GCN2-NEXT: v_and_b32_e32 v6, v8, v2
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB43_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: ; implicit-def: $vgpr2
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB43_2
-; GCN2-NEXT: .LBB43_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB43_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
@@ -5394,21 +6002,37 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB43_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB43_4
+; GCN3-NEXT: s_cbranch_execnz .LBB43_6
; GCN3-NEXT: .LBB43_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB43_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB43_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_and_b32_e32 v7, v9, v3
+; GCN3-NEXT: v_and_b32_e32 v6, v8, v2
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB43_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: ; implicit-def: $vgpr2
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB43_2
-; GCN3-NEXT: .LBB43_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB43_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
@@ -5439,21 +6063,39 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN1-NEXT: s_mov_b64 s[34:35], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB44_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_vccz .LBB44_4
+; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_vccnz .LBB44_6
; GCN1-NEXT: .LBB44_2: ; %atomicrmw.phi
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB44_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_add_u32 s34, s4, 4
+; GCN1-NEXT: s_addc_u32 s35, s5, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v4, s4
+; GCN1-NEXT: v_mov_b32_e32 v5, s5
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: flat_load_dword v2, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB44_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v1, s7, v3
+; GCN1-NEXT: v_and_b32_e32 v0, s6, v2
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execnz .LBB44_2
-; GCN1-NEXT: .LBB44_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_mov_b32_e32 v3, v1
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v2, v0
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB44_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_branch .LBB44_2
+; GCN1-NEXT: .LBB44_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
; GCN1-NEXT: s_cselect_b32 s34, s4, -1
@@ -5482,21 +6124,39 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN2-NEXT: s_mov_b64 s[34:35], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB44_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_vccz .LBB44_4
+; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_vccnz .LBB44_6
; GCN2-NEXT: .LBB44_2: ; %atomicrmw.phi
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB44_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_add_u32 s34, s4, 4
+; GCN2-NEXT: s_addc_u32 s35, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v4, s4
+; GCN2-NEXT: v_mov_b32_e32 v5, s5
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: flat_load_dword v2, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB44_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v1, s7, v3
+; GCN2-NEXT: v_and_b32_e32 v0, s6, v2
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execnz .LBB44_2
-; GCN2-NEXT: .LBB44_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_mov_b32_e32 v3, v1
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v2, v0
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB44_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_branch .LBB44_2
+; GCN2-NEXT: .LBB44_6: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s34, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s34
@@ -5522,21 +6182,34 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN3-NEXT: s_mov_b64 s[34:35], -1
; GCN3-NEXT: s_cbranch_vccnz .LBB44_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
-; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_vccz .LBB44_4
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_vccnz .LBB44_6
; GCN3-NEXT: .LBB44_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB44_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v4, s4
+; GCN3-NEXT: v_mov_b32_e32 v5, s5
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB44_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_and_b32_e32 v1, s7, v3
+; GCN3-NEXT: v_and_b32_e32 v0, s6, v2
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execnz .LBB44_2
-; GCN3-NEXT: .LBB44_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v3, v1
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v2, v0
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB44_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_branch .LBB44_2
+; GCN3-NEXT: .LBB44_6: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN3-NEXT: s_cselect_b32 s34, s4, -1
; GCN3-NEXT: v_mov_b32_e32 v0, s34
@@ -5568,21 +6241,39 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN1-NEXT: s_mov_b64 s[36:37], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB45_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT: s_cbranch_vccz .LBB45_4
+; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_vccnz .LBB45_6
; GCN1-NEXT: .LBB45_2: ; %atomicrmw.phi
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB45_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_add_u32 s36, s34, 4
+; GCN1-NEXT: s_addc_u32 s37, s35, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s36
+; GCN1-NEXT: v_mov_b32_e32 v1, s37
+; GCN1-NEXT: v_mov_b32_e32 v4, s34
+; GCN1-NEXT: v_mov_b32_e32 v5, s35
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: flat_load_dword v2, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: .LBB45_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_and_b32_e32 v1, s7, v3
+; GCN1-NEXT: v_and_b32_e32 v0, s6, v2
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execnz .LBB45_2
-; GCN1-NEXT: .LBB45_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_mov_b32_e32 v3, v1
+; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN1-NEXT: v_mov_b32_e32 v2, v0
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_execnz .LBB45_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_branch .LBB45_2
+; GCN1-NEXT: .LBB45_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
; GCN1-NEXT: s_cselect_b32 s34, s34, -1
@@ -5613,21 +6304,39 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN2-NEXT: s_mov_b64 s[36:37], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB45_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT: s_cbranch_vccz .LBB45_4
+; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_vccnz .LBB45_6
; GCN2-NEXT: .LBB45_2: ; %atomicrmw.phi
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB45_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_add_u32 s36, s34, 4
+; GCN2-NEXT: s_addc_u32 s37, s35, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s36
+; GCN2-NEXT: v_mov_b32_e32 v1, s37
+; GCN2-NEXT: v_mov_b32_e32 v4, s34
+; GCN2-NEXT: v_mov_b32_e32 v5, s35
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: flat_load_dword v2, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: .LBB45_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_and_b32_e32 v1, s7, v3
+; GCN2-NEXT: v_and_b32_e32 v0, s6, v2
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execnz .LBB45_2
-; GCN2-NEXT: .LBB45_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_mov_b32_e32 v3, v1
+; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN2-NEXT: v_mov_b32_e32 v2, v0
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_execnz .LBB45_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_branch .LBB45_2
+; GCN2-NEXT: .LBB45_6: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN2-NEXT: s_cselect_b32 s34, s34, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s34
@@ -5655,21 +6364,34 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN3-NEXT: s_mov_b64 s[36:37], -1
; GCN3-NEXT: s_cbranch_vccnz .LBB45_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
-; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT: s_cbranch_vccz .LBB45_4
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_vccnz .LBB45_6
; GCN3-NEXT: .LBB45_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB45_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v4, s34
+; GCN3-NEXT: v_mov_b32_e32 v5, s35
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: .LBB45_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_and_b32_e32 v1, s7, v3
+; GCN3-NEXT: v_and_b32_e32 v0, s6, v2
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execnz .LBB45_2
-; GCN3-NEXT: .LBB45_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v3, v1
+; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN3-NEXT: v_mov_b32_e32 v2, v0
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_execnz .LBB45_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_branch .LBB45_2
+; GCN3-NEXT: .LBB45_6: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN3-NEXT: s_cselect_b32 s34, s34, -1
; GCN3-NEXT: v_mov_b32_e32 v0, s34
@@ -5698,20 +6420,38 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN1-NEXT: s_cmp_eq_u32 s5, s34
; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_vccz .LBB46_2
+; GCN1-NEXT: s_cbranch_vccz .LBB46_4
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_add_u32 s34, s4, 4
+; GCN1-NEXT: s_addc_u32 s35, s5, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
+; GCN1-NEXT: v_mov_b32_e32 v3, s5
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[2:3]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB46_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v7, v1
+; GCN1-NEXT: v_mov_b32_e32 v6, v0
+; GCN1-NEXT: v_and_b32_e32 v5, s7, v7
+; GCN1-NEXT: v_and_b32_e32 v4, s6, v6
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execz .LBB46_3
-; GCN1-NEXT: s_branch .LBB46_4
-; GCN1-NEXT: .LBB46_2:
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB46_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_branch .LBB46_6
+; GCN1-NEXT: .LBB46_4:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT: .LBB46_3: ; %atomicrmw.private
+; GCN1-NEXT: s_cbranch_execz .LBB46_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
; GCN1-NEXT: s_cselect_b32 s34, s4, -1
@@ -5726,7 +6466,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN1-NEXT: v_and_b32_e32 v5, s7, v1
; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN1-NEXT: .LBB46_4: ; %atomicrmw.end
+; GCN1-NEXT: .LBB46_6: ; %atomicrmw.phi
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
@@ -5739,20 +6479,38 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN2-NEXT: s_cmp_eq_u32 s5, s34
; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_vccz .LBB46_2
+; GCN2-NEXT: s_cbranch_vccz .LBB46_4
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_add_u32 s34, s4, 4
+; GCN2-NEXT: s_addc_u32 s35, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
+; GCN2-NEXT: v_mov_b32_e32 v3, s5
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[2:3]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB46_2: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v7, v1
+; GCN2-NEXT: v_mov_b32_e32 v6, v0
+; GCN2-NEXT: v_and_b32_e32 v5, s7, v7
+; GCN2-NEXT: v_and_b32_e32 v4, s6, v6
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execz .LBB46_3
-; GCN2-NEXT: s_branch .LBB46_4
-; GCN2-NEXT: .LBB46_2:
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB46_2
+; GCN2-NEXT: ; %bb.3: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_branch .LBB46_6
+; GCN2-NEXT: .LBB46_4:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT: .LBB46_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cbranch_execz .LBB46_6
+; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s34, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s34
@@ -5766,7 +6524,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN2-NEXT: v_and_b32_e32 v5, s7, v1
; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN2-NEXT: .LBB46_4: ; %atomicrmw.end
+; GCN2-NEXT: .LBB46_6: ; %atomicrmw.phi
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
@@ -5777,20 +6535,33 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN3-NEXT: s_cmp_eq_u32 s5, s35
; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_vccz .LBB46_2
+; GCN3-NEXT: s_cbranch_vccz .LBB46_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v2, s4
+; GCN3-NEXT: v_mov_b32_e32 v3, s5
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB46_2: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v7, v1
+; GCN3-NEXT: v_mov_b32_e32 v6, v0
+; GCN3-NEXT: v_and_b32_e32 v5, s7, v7
+; GCN3-NEXT: v_and_b32_e32 v4, s6, v6
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execz .LBB46_3
-; GCN3-NEXT: s_branch .LBB46_4
-; GCN3-NEXT: .LBB46_2:
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB46_2
+; GCN3-NEXT: ; %bb.3: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_branch .LBB46_6
+; GCN3-NEXT: .LBB46_4:
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT: .LBB46_3: ; %atomicrmw.private
+; GCN3-NEXT: s_cbranch_execz .LBB46_6
+; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN3-NEXT: s_cselect_b32 s34, s4, -1
; GCN3-NEXT: v_mov_b32_e32 v2, s34
@@ -5802,7 +6573,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN3-NEXT: v_and_b32_e32 v4, s6, v0
; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT: .LBB46_4: ; %atomicrmw.end
+; GCN3-NEXT: .LBB46_6: ; %atomicrmw.phi
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw and ptr %ptr, i64 %in seq_cst
@@ -5821,20 +6592,38 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN1-NEXT: s_cmp_eq_u32 s35, s36
; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT: s_cbranch_vccz .LBB47_2
+; GCN1-NEXT: s_cbranch_vccz .LBB47_4
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_add_u32 s36, s34, 4
+; GCN1-NEXT: s_addc_u32 s37, s35, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s36
+; GCN1-NEXT: v_mov_b32_e32 v1, s37
+; GCN1-NEXT: v_mov_b32_e32 v2, s34
+; GCN1-NEXT: v_mov_b32_e32 v3, s35
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[2:3]
+; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: .LBB47_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v7, v1
+; GCN1-NEXT: v_mov_b32_e32 v6, v0
+; GCN1-NEXT: v_and_b32_e32 v5, s7, v7
+; GCN1-NEXT: v_and_b32_e32 v4, s6, v6
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execz .LBB47_3
-; GCN1-NEXT: s_branch .LBB47_4
-; GCN1-NEXT: .LBB47_2:
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_execnz .LBB47_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_branch .LBB47_6
+; GCN1-NEXT: .LBB47_4:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT: .LBB47_3: ; %atomicrmw.private
+; GCN1-NEXT: s_cbranch_execz .LBB47_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
; GCN1-NEXT: s_cselect_b32 s34, s34, -1
@@ -5849,7 +6638,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN1-NEXT: v_and_b32_e32 v5, s7, v1
; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN1-NEXT: .LBB47_4: ; %atomicrmw.end
+; GCN1-NEXT: .LBB47_6: ; %atomicrmw.phi
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
@@ -5864,20 +6653,38 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN2-NEXT: s_cmp_eq_u32 s35, s36
; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT: s_cbranch_vccz .LBB47_2
+; GCN2-NEXT: s_cbranch_vccz .LBB47_4
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_add_u32 s36, s34, 4
+; GCN2-NEXT: s_addc_u32 s37, s35, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s36
+; GCN2-NEXT: v_mov_b32_e32 v1, s37
+; GCN2-NEXT: v_mov_b32_e32 v2, s34
+; GCN2-NEXT: v_mov_b32_e32 v3, s35
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[2:3]
+; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: .LBB47_2: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v7, v1
+; GCN2-NEXT: v_mov_b32_e32 v6, v0
+; GCN2-NEXT: v_and_b32_e32 v5, s7, v7
+; GCN2-NEXT: v_and_b32_e32 v4, s6, v6
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execz .LBB47_3
-; GCN2-NEXT: s_branch .LBB47_4
-; GCN2-NEXT: .LBB47_2:
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_execnz .LBB47_2
+; GCN2-NEXT: ; %bb.3: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_branch .LBB47_6
+; GCN2-NEXT: .LBB47_4:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT: .LBB47_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cbranch_execz .LBB47_6
+; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN2-NEXT: s_cselect_b32 s34, s34, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s34
@@ -5891,7 +6698,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN2-NEXT: v_and_b32_e32 v5, s7, v1
; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN2-NEXT: .LBB47_4: ; %atomicrmw.end
+; GCN2-NEXT: .LBB47_6: ; %atomicrmw.phi
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
@@ -5904,20 +6711,33 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN3-NEXT: s_cmp_eq_u32 s35, s37
; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT: s_cbranch_vccz .LBB47_2
+; GCN3-NEXT: s_cbranch_vccz .LBB47_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v2, s34
+; GCN3-NEXT: v_mov_b32_e32 v3, s35
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: .LBB47_2: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v7, v1
+; GCN3-NEXT: v_mov_b32_e32 v6, v0
+; GCN3-NEXT: v_and_b32_e32 v5, s7, v7
+; GCN3-NEXT: v_and_b32_e32 v4, s6, v6
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execz .LBB47_3
-; GCN3-NEXT: s_branch .LBB47_4
-; GCN3-NEXT: .LBB47_2:
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_execnz .LBB47_2
+; GCN3-NEXT: ; %bb.3: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_branch .LBB47_6
+; GCN3-NEXT: .LBB47_4:
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT: .LBB47_3: ; %atomicrmw.private
+; GCN3-NEXT: s_cbranch_execz .LBB47_6
+; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN3-NEXT: s_cselect_b32 s34, s34, -1
; GCN3-NEXT: v_mov_b32_e32 v2, s34
@@ -5929,7 +6749,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN3-NEXT: v_and_b32_e32 v4, s6, v0
; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT: .LBB47_4: ; %atomicrmw.end
+; GCN3-NEXT: .LBB47_6: ; %atomicrmw.phi
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
@@ -8126,21 +8946,40 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) {
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB60_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: ; %bb.1: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB60_4
+; GCN1-NEXT: s_cbranch_execnz .LBB60_6
; GCN1-NEXT: .LBB60_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB60_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v7, v[4:5]
+; GCN1-NEXT: flat_load_dword v6, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB60_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v5, v7, v3
+; GCN1-NEXT: v_or_b32_e32 v4, v6, v2
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB60_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: ; implicit-def: $vgpr2
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB60_2
-; GCN1-NEXT: .LBB60_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB60_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
@@ -8166,21 +9005,40 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) {
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB60_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: ; %bb.1: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB60_4
+; GCN2-NEXT: s_cbranch_execnz .LBB60_6
; GCN2-NEXT: .LBB60_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB60_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v7, v[4:5]
+; GCN2-NEXT: flat_load_dword v6, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB60_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v5, v7, v3
+; GCN2-NEXT: v_or_b32_e32 v4, v6, v2
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB60_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: ; implicit-def: $vgpr2
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB60_2
-; GCN2-NEXT: .LBB60_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB60_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
@@ -8204,21 +9062,37 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB60_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB60_4
+; GCN3-NEXT: s_cbranch_execnz .LBB60_6
; GCN3-NEXT: .LBB60_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB60_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB60_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v5, v7, v3
+; GCN3-NEXT: v_or_b32_e32 v4, v6, v2
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB60_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: ; implicit-def: $vgpr2
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB60_2
-; GCN3-NEXT: .LBB60_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB60_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
@@ -8249,21 +9123,40 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB61_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: ; %bb.1: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB61_4
+; GCN1-NEXT: s_cbranch_execnz .LBB61_6
; GCN1-NEXT: .LBB61_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB61_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v7, v[4:5]
+; GCN1-NEXT: flat_load_dword v6, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB61_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v5, v7, v3
+; GCN1-NEXT: v_or_b32_e32 v4, v6, v2
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB61_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: ; implicit-def: $vgpr2
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB61_2
-; GCN1-NEXT: .LBB61_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB61_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
@@ -8291,21 +9184,40 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB61_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: ; %bb.1: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB61_4
+; GCN2-NEXT: s_cbranch_execnz .LBB61_6
; GCN2-NEXT: .LBB61_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB61_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v7, v[4:5]
+; GCN2-NEXT: flat_load_dword v6, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB61_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v5, v7, v3
+; GCN2-NEXT: v_or_b32_e32 v4, v6, v2
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB61_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: ; implicit-def: $vgpr2
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB61_2
-; GCN2-NEXT: .LBB61_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB61_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
@@ -8331,21 +9243,37 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB61_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB61_4
+; GCN3-NEXT: s_cbranch_execnz .LBB61_6
; GCN3-NEXT: .LBB61_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB61_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB61_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v5, v7, v3
+; GCN3-NEXT: v_or_b32_e32 v4, v6, v2
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB61_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: ; implicit-def: $vgpr2
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB61_2
-; GCN3-NEXT: .LBB61_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB61_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
@@ -8370,41 +9298,56 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) {
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN1-NEXT: v_mov_b32_e32 v5, v1
-; GCN1-NEXT: v_mov_b32_e32 v4, v0
-; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB62_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB62_4
-; GCN1-NEXT: .LBB62_2: ; %atomicrmw.phi
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-; GCN1-NEXT: .LBB62_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN1-NEXT: s_cbranch_execz .LBB62_4
+; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v5, v[4:5]
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB62_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: v_or_b32_e32 v5, v7, v3
+; GCN1-NEXT: v_or_b32_e32 v4, v6, v2
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB62_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: ; implicit-def: $vgpr2
+; GCN1-NEXT: .LBB62_4: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execz .LBB62_2
-; GCN1-NEXT: .LBB62_4: ; %atomicrmw.private
-; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
-; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN1-NEXT: s_cbranch_execz .LBB62_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
-; GCN1-NEXT: v_or_b32_e32 v2, v0, v2
+; GCN1-NEXT: v_or_b32_e32 v2, v4, v2
; GCN1-NEXT: s_waitcnt vmcnt(0)
-; GCN1-NEXT: v_or_b32_e32 v3, v1, v3
-; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
-; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN1-NEXT: v_or_b32_e32 v3, v5, v3
+; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT: .LBB62_6: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v0, v4
+; GCN1-NEXT: v_mov_b32_e32 v1, v5
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
@@ -8413,41 +9356,56 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) {
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN2-NEXT: v_mov_b32_e32 v5, v1
-; GCN2-NEXT: v_mov_b32_e32 v4, v0
-; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB62_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB62_4
-; GCN2-NEXT: .LBB62_2: ; %atomicrmw.phi
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-; GCN2-NEXT: .LBB62_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN2-NEXT: s_cbranch_execz .LBB62_4
+; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v5, v[4:5]
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB62_2: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: v_or_b32_e32 v5, v7, v3
+; GCN2-NEXT: v_or_b32_e32 v4, v6, v2
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB62_2
+; GCN2-NEXT: ; %bb.3: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: ; implicit-def: $vgpr2
+; GCN2-NEXT: .LBB62_4: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execz .LBB62_2
-; GCN2-NEXT: .LBB62_4: ; %atomicrmw.private
-; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
-; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN2-NEXT: s_cbranch_execz .LBB62_6
+; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
+; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
-; GCN2-NEXT: v_or_b32_e32 v2, v0, v2
+; GCN2-NEXT: v_or_b32_e32 v2, v4, v2
; GCN2-NEXT: s_waitcnt vmcnt(0)
-; GCN2-NEXT: v_or_b32_e32 v3, v1, v3
-; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
-; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN2-NEXT: v_or_b32_e32 v3, v5, v3
+; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT: .LBB62_6: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v0, v4
+; GCN2-NEXT: v_mov_b32_e32 v1, v5
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
@@ -8462,21 +9420,37 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB62_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB62_4
+; GCN3-NEXT: s_cbranch_execnz .LBB62_6
; GCN3-NEXT: .LBB62_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB62_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB62_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_or_b32_e32 v7, v9, v3
+; GCN3-NEXT: v_or_b32_e32 v6, v8, v2
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB62_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: ; implicit-def: $vgpr2
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB62_2
-; GCN3-NEXT: .LBB62_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB62_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
@@ -8508,21 +9482,40 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB63_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: ; %bb.1: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB63_4
+; GCN1-NEXT: s_cbranch_execnz .LBB63_6
; GCN1-NEXT: .LBB63_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB63_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB63_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_or_b32_e32 v7, v9, v3
+; GCN1-NEXT: v_or_b32_e32 v6, v8, v2
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB63_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: ; implicit-def: $vgpr2
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB63_2
-; GCN1-NEXT: .LBB63_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB63_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
@@ -8551,21 +9544,40 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB63_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: ; %bb.1: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB63_4
+; GCN2-NEXT: s_cbranch_execnz .LBB63_6
; GCN2-NEXT: .LBB63_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB63_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB63_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_or_b32_e32 v7, v9, v3
+; GCN2-NEXT: v_or_b32_e32 v6, v8, v2
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB63_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: ; implicit-def: $vgpr2
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB63_2
-; GCN2-NEXT: .LBB63_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB63_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
@@ -8592,21 +9604,37 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB63_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB63_4
+; GCN3-NEXT: s_cbranch_execnz .LBB63_6
; GCN3-NEXT: .LBB63_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB63_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB63_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_or_b32_e32 v7, v9, v3
+; GCN3-NEXT: v_or_b32_e32 v6, v8, v2
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB63_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: ; implicit-def: $vgpr2
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB63_2
-; GCN3-NEXT: .LBB63_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB63_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
@@ -8637,21 +9665,39 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN1-NEXT: s_mov_b64 s[34:35], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB64_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_vccz .LBB64_4
+; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_vccnz .LBB64_6
; GCN1-NEXT: .LBB64_2: ; %atomicrmw.phi
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB64_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_add_u32 s34, s4, 4
+; GCN1-NEXT: s_addc_u32 s35, s5, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v4, s4
+; GCN1-NEXT: v_mov_b32_e32 v5, s5
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: flat_load_dword v2, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB64_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v1, s7, v3
+; GCN1-NEXT: v_or_b32_e32 v0, s6, v2
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execnz .LBB64_2
-; GCN1-NEXT: .LBB64_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_mov_b32_e32 v3, v1
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v2, v0
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB64_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_branch .LBB64_2
+; GCN1-NEXT: .LBB64_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
; GCN1-NEXT: s_cselect_b32 s34, s4, -1
@@ -8680,21 +9726,39 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN2-NEXT: s_mov_b64 s[34:35], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB64_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_vccz .LBB64_4
+; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_vccnz .LBB64_6
; GCN2-NEXT: .LBB64_2: ; %atomicrmw.phi
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB64_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_add_u32 s34, s4, 4
+; GCN2-NEXT: s_addc_u32 s35, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v4, s4
+; GCN2-NEXT: v_mov_b32_e32 v5, s5
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: flat_load_dword v2, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB64_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v1, s7, v3
+; GCN2-NEXT: v_or_b32_e32 v0, s6, v2
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execnz .LBB64_2
-; GCN2-NEXT: .LBB64_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_mov_b32_e32 v3, v1
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v2, v0
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB64_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_branch .LBB64_2
+; GCN2-NEXT: .LBB64_6: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s34, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s34
@@ -8720,21 +9784,34 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN3-NEXT: s_mov_b64 s[34:35], -1
; GCN3-NEXT: s_cbranch_vccnz .LBB64_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
-; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_vccz .LBB64_4
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_vccnz .LBB64_6
; GCN3-NEXT: .LBB64_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB64_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v4, s4
+; GCN3-NEXT: v_mov_b32_e32 v5, s5
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB64_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v1, s7, v3
+; GCN3-NEXT: v_or_b32_e32 v0, s6, v2
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execnz .LBB64_2
-; GCN3-NEXT: .LBB64_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v3, v1
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v2, v0
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB64_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_branch .LBB64_2
+; GCN3-NEXT: .LBB64_6: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN3-NEXT: s_cselect_b32 s34, s4, -1
; GCN3-NEXT: v_mov_b32_e32 v0, s34
@@ -8766,21 +9843,39 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN1-NEXT: s_mov_b64 s[36:37], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB65_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT: s_cbranch_vccz .LBB65_4
+; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_vccnz .LBB65_6
; GCN1-NEXT: .LBB65_2: ; %atomicrmw.phi
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB65_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_add_u32 s36, s34, 4
+; GCN1-NEXT: s_addc_u32 s37, s35, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s36
+; GCN1-NEXT: v_mov_b32_e32 v1, s37
+; GCN1-NEXT: v_mov_b32_e32 v4, s34
+; GCN1-NEXT: v_mov_b32_e32 v5, s35
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: flat_load_dword v2, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: .LBB65_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_or_b32_e32 v1, s7, v3
+; GCN1-NEXT: v_or_b32_e32 v0, s6, v2
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execnz .LBB65_2
-; GCN1-NEXT: .LBB65_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_mov_b32_e32 v3, v1
+; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN1-NEXT: v_mov_b32_e32 v2, v0
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_execnz .LBB65_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_branch .LBB65_2
+; GCN1-NEXT: .LBB65_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
; GCN1-NEXT: s_cselect_b32 s34, s34, -1
@@ -8811,21 +9906,39 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN2-NEXT: s_mov_b64 s[36:37], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB65_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT: s_cbranch_vccz .LBB65_4
+; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_vccnz .LBB65_6
; GCN2-NEXT: .LBB65_2: ; %atomicrmw.phi
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB65_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_add_u32 s36, s34, 4
+; GCN2-NEXT: s_addc_u32 s37, s35, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s36
+; GCN2-NEXT: v_mov_b32_e32 v1, s37
+; GCN2-NEXT: v_mov_b32_e32 v4, s34
+; GCN2-NEXT: v_mov_b32_e32 v5, s35
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: flat_load_dword v2, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: .LBB65_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_or_b32_e32 v1, s7, v3
+; GCN2-NEXT: v_or_b32_e32 v0, s6, v2
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execnz .LBB65_2
-; GCN2-NEXT: .LBB65_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_mov_b32_e32 v3, v1
+; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN2-NEXT: v_mov_b32_e32 v2, v0
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_execnz .LBB65_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_branch .LBB65_2
+; GCN2-NEXT: .LBB65_6: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN2-NEXT: s_cselect_b32 s34, s34, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s34
@@ -8853,21 +9966,34 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN3-NEXT: s_mov_b64 s[36:37], -1
; GCN3-NEXT: s_cbranch_vccnz .LBB65_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
-; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT: s_cbranch_vccz .LBB65_4
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_vccnz .LBB65_6
; GCN3-NEXT: .LBB65_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB65_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v4, s34
+; GCN3-NEXT: v_mov_b32_e32 v5, s35
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: .LBB65_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_or_b32_e32 v1, s7, v3
+; GCN3-NEXT: v_or_b32_e32 v0, s6, v2
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execnz .LBB65_2
-; GCN3-NEXT: .LBB65_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v3, v1
+; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN3-NEXT: v_mov_b32_e32 v2, v0
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_execnz .LBB65_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_branch .LBB65_2
+; GCN3-NEXT: .LBB65_6: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN3-NEXT: s_cselect_b32 s34, s34, -1
; GCN3-NEXT: v_mov_b32_e32 v0, s34
@@ -8896,20 +10022,38 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
; GCN1-NEXT: s_cmp_eq_u32 s5, s34
; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_vccz .LBB66_2
+; GCN1-NEXT: s_cbranch_vccz .LBB66_4
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_add_u32 s34, s4, 4
+; GCN1-NEXT: s_addc_u32 s35, s5, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
+; GCN1-NEXT: v_mov_b32_e32 v3, s5
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[2:3]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB66_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v7, v1
+; GCN1-NEXT: v_mov_b32_e32 v6, v0
+; GCN1-NEXT: v_or_b32_e32 v5, s7, v7
+; GCN1-NEXT: v_or_b32_e32 v4, s6, v6
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execz .LBB66_3
-; GCN1-NEXT: s_branch .LBB66_4
-; GCN1-NEXT: .LBB66_2:
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB66_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_branch .LBB66_6
+; GCN1-NEXT: .LBB66_4:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT: .LBB66_3: ; %atomicrmw.private
+; GCN1-NEXT: s_cbranch_execz .LBB66_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
; GCN1-NEXT: s_cselect_b32 s34, s4, -1
@@ -8924,7 +10068,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
; GCN1-NEXT: v_or_b32_e32 v5, s7, v1
; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN1-NEXT: .LBB66_4: ; %atomicrmw.end
+; GCN1-NEXT: .LBB66_6: ; %atomicrmw.phi
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
@@ -8937,20 +10081,38 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
; GCN2-NEXT: s_cmp_eq_u32 s5, s34
; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_vccz .LBB66_2
+; GCN2-NEXT: s_cbranch_vccz .LBB66_4
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_add_u32 s34, s4, 4
+; GCN2-NEXT: s_addc_u32 s35, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
+; GCN2-NEXT: v_mov_b32_e32 v3, s5
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[2:3]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB66_2: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v7, v1
+; GCN2-NEXT: v_mov_b32_e32 v6, v0
+; GCN2-NEXT: v_or_b32_e32 v5, s7, v7
+; GCN2-NEXT: v_or_b32_e32 v4, s6, v6
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execz .LBB66_3
-; GCN2-NEXT: s_branch .LBB66_4
-; GCN2-NEXT: .LBB66_2:
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB66_2
+; GCN2-NEXT: ; %bb.3: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_branch .LBB66_6
+; GCN2-NEXT: .LBB66_4:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT: .LBB66_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cbranch_execz .LBB66_6
+; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s34, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s34
@@ -8964,7 +10126,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
; GCN2-NEXT: v_or_b32_e32 v5, s7, v1
; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN2-NEXT: .LBB66_4: ; %atomicrmw.end
+; GCN2-NEXT: .LBB66_6: ; %atomicrmw.phi
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
@@ -8975,20 +10137,33 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
; GCN3-NEXT: s_cmp_eq_u32 s5, s35
; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_vccz .LBB66_2
+; GCN3-NEXT: s_cbranch_vccz .LBB66_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v2, s4
+; GCN3-NEXT: v_mov_b32_e32 v3, s5
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB66_2: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v7, v1
+; GCN3-NEXT: v_mov_b32_e32 v6, v0
+; GCN3-NEXT: v_or_b32_e32 v5, s7, v7
+; GCN3-NEXT: v_or_b32_e32 v4, s6, v6
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execz .LBB66_3
-; GCN3-NEXT: s_branch .LBB66_4
-; GCN3-NEXT: .LBB66_2:
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB66_2
+; GCN3-NEXT: ; %bb.3: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_branch .LBB66_6
+; GCN3-NEXT: .LBB66_4:
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT: .LBB66_3: ; %atomicrmw.private
+; GCN3-NEXT: s_cbranch_execz .LBB66_6
+; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN3-NEXT: s_cselect_b32 s34, s4, -1
; GCN3-NEXT: v_mov_b32_e32 v2, s34
@@ -9000,7 +10175,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
; GCN3-NEXT: v_or_b32_e32 v4, s6, v0
; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT: .LBB66_4: ; %atomicrmw.end
+; GCN3-NEXT: .LBB66_6: ; %atomicrmw.phi
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw or ptr %ptr, i64 %in seq_cst
@@ -9019,20 +10194,38 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN1-NEXT: s_cmp_eq_u32 s35, s36
; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT: s_cbranch_vccz .LBB67_2
+; GCN1-NEXT: s_cbranch_vccz .LBB67_4
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_add_u32 s36, s34, 4
+; GCN1-NEXT: s_addc_u32 s37, s35, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s36
+; GCN1-NEXT: v_mov_b32_e32 v1, s37
+; GCN1-NEXT: v_mov_b32_e32 v2, s34
+; GCN1-NEXT: v_mov_b32_e32 v3, s35
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[2:3]
+; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: .LBB67_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v7, v1
+; GCN1-NEXT: v_mov_b32_e32 v6, v0
+; GCN1-NEXT: v_or_b32_e32 v5, s7, v7
+; GCN1-NEXT: v_or_b32_e32 v4, s6, v6
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execz .LBB67_3
-; GCN1-NEXT: s_branch .LBB67_4
-; GCN1-NEXT: .LBB67_2:
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_execnz .LBB67_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_branch .LBB67_6
+; GCN1-NEXT: .LBB67_4:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT: .LBB67_3: ; %atomicrmw.private
+; GCN1-NEXT: s_cbranch_execz .LBB67_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
; GCN1-NEXT: s_cselect_b32 s34, s34, -1
@@ -9047,7 +10240,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN1-NEXT: v_or_b32_e32 v5, s7, v1
; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN1-NEXT: .LBB67_4: ; %atomicrmw.end
+; GCN1-NEXT: .LBB67_6: ; %atomicrmw.phi
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
@@ -9062,20 +10255,38 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN2-NEXT: s_cmp_eq_u32 s35, s36
; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT: s_cbranch_vccz .LBB67_2
+; GCN2-NEXT: s_cbranch_vccz .LBB67_4
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_add_u32 s36, s34, 4
+; GCN2-NEXT: s_addc_u32 s37, s35, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s36
+; GCN2-NEXT: v_mov_b32_e32 v1, s37
+; GCN2-NEXT: v_mov_b32_e32 v2, s34
+; GCN2-NEXT: v_mov_b32_e32 v3, s35
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[2:3]
+; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: .LBB67_2: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v7, v1
+; GCN2-NEXT: v_mov_b32_e32 v6, v0
+; GCN2-NEXT: v_or_b32_e32 v5, s7, v7
+; GCN2-NEXT: v_or_b32_e32 v4, s6, v6
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execz .LBB67_3
-; GCN2-NEXT: s_branch .LBB67_4
-; GCN2-NEXT: .LBB67_2:
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_execnz .LBB67_2
+; GCN2-NEXT: ; %bb.3: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_branch .LBB67_6
+; GCN2-NEXT: .LBB67_4:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT: .LBB67_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cbranch_execz .LBB67_6
+; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN2-NEXT: s_cselect_b32 s34, s34, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s34
@@ -9089,7 +10300,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN2-NEXT: v_or_b32_e32 v5, s7, v1
; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN2-NEXT: .LBB67_4: ; %atomicrmw.end
+; GCN2-NEXT: .LBB67_6: ; %atomicrmw.phi
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
@@ -9102,20 +10313,33 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN3-NEXT: s_cmp_eq_u32 s35, s37
; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT: s_cbranch_vccz .LBB67_2
+; GCN3-NEXT: s_cbranch_vccz .LBB67_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v2, s34
+; GCN3-NEXT: v_mov_b32_e32 v3, s35
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: .LBB67_2: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v7, v1
+; GCN3-NEXT: v_mov_b32_e32 v6, v0
+; GCN3-NEXT: v_or_b32_e32 v5, s7, v7
+; GCN3-NEXT: v_or_b32_e32 v4, s6, v6
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execz .LBB67_3
-; GCN3-NEXT: s_branch .LBB67_4
-; GCN3-NEXT: .LBB67_2:
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_execnz .LBB67_2
+; GCN3-NEXT: ; %bb.3: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_branch .LBB67_6
+; GCN3-NEXT: .LBB67_4:
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT: .LBB67_3: ; %atomicrmw.private
+; GCN3-NEXT: s_cbranch_execz .LBB67_6
+; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN3-NEXT: s_cselect_b32 s34, s34, -1
; GCN3-NEXT: v_mov_b32_e32 v2, s34
@@ -9127,7 +10351,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN3-NEXT: v_or_b32_e32 v4, s6, v0
; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT: .LBB67_4: ; %atomicrmw.end
+; GCN3-NEXT: .LBB67_6: ; %atomicrmw.phi
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
@@ -9409,21 +10633,40 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) {
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB70_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: ; %bb.1: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB70_4
+; GCN1-NEXT: s_cbranch_execnz .LBB70_6
; GCN1-NEXT: .LBB70_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB70_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v7, v[4:5]
+; GCN1-NEXT: flat_load_dword v6, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB70_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_xor_b32_e32 v5, v7, v3
+; GCN1-NEXT: v_xor_b32_e32 v4, v6, v2
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB70_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: ; implicit-def: $vgpr2
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB70_2
-; GCN1-NEXT: .LBB70_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB70_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
@@ -9449,21 +10692,40 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) {
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB70_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: ; %bb.1: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB70_4
+; GCN2-NEXT: s_cbranch_execnz .LBB70_6
; GCN2-NEXT: .LBB70_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB70_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v7, v[4:5]
+; GCN2-NEXT: flat_load_dword v6, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB70_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_xor_b32_e32 v5, v7, v3
+; GCN2-NEXT: v_xor_b32_e32 v4, v6, v2
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB70_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: ; implicit-def: $vgpr2
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB70_2
-; GCN2-NEXT: .LBB70_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB70_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
@@ -9487,21 +10749,37 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB70_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB70_4
+; GCN3-NEXT: s_cbranch_execnz .LBB70_6
; GCN3-NEXT: .LBB70_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB70_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB70_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_xor_b32_e32 v5, v7, v3
+; GCN3-NEXT: v_xor_b32_e32 v4, v6, v2
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB70_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: ; implicit-def: $vgpr2
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB70_2
-; GCN3-NEXT: .LBB70_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB70_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
@@ -9532,21 +10810,40 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB71_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: ; %bb.1: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB71_4
+; GCN1-NEXT: s_cbranch_execnz .LBB71_6
; GCN1-NEXT: .LBB71_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB71_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v7, v[4:5]
+; GCN1-NEXT: flat_load_dword v6, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB71_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_xor_b32_e32 v5, v7, v3
+; GCN1-NEXT: v_xor_b32_e32 v4, v6, v2
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB71_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: ; implicit-def: $vgpr2
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB71_2
-; GCN1-NEXT: .LBB71_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB71_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
@@ -9574,21 +10871,40 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB71_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: ; %bb.1: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB71_4
+; GCN2-NEXT: s_cbranch_execnz .LBB71_6
; GCN2-NEXT: .LBB71_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB71_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v7, v[4:5]
+; GCN2-NEXT: flat_load_dword v6, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB71_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_xor_b32_e32 v5, v7, v3
+; GCN2-NEXT: v_xor_b32_e32 v4, v6, v2
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB71_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: ; implicit-def: $vgpr2
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB71_2
-; GCN2-NEXT: .LBB71_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB71_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
@@ -9614,21 +10930,37 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB71_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB71_4
+; GCN3-NEXT: s_cbranch_execnz .LBB71_6
; GCN3-NEXT: .LBB71_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB71_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB71_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_xor_b32_e32 v5, v7, v3
+; GCN3-NEXT: v_xor_b32_e32 v4, v6, v2
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB71_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: ; implicit-def: $vgpr2
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB71_2
-; GCN3-NEXT: .LBB71_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB71_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
@@ -9653,41 +10985,56 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) {
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN1-NEXT: v_mov_b32_e32 v5, v1
-; GCN1-NEXT: v_mov_b32_e32 v4, v0
-; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB72_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB72_4
-; GCN1-NEXT: .LBB72_2: ; %atomicrmw.phi
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-; GCN1-NEXT: .LBB72_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN1-NEXT: s_cbranch_execz .LBB72_4
+; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v5, v[4:5]
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB72_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: v_xor_b32_e32 v5, v7, v3
+; GCN1-NEXT: v_xor_b32_e32 v4, v6, v2
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB72_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: ; implicit-def: $vgpr2
+; GCN1-NEXT: .LBB72_4: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execz .LBB72_2
-; GCN1-NEXT: .LBB72_4: ; %atomicrmw.private
-; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
-; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN1-NEXT: s_cbranch_execz .LBB72_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
-; GCN1-NEXT: v_xor_b32_e32 v2, v0, v2
+; GCN1-NEXT: v_xor_b32_e32 v2, v4, v2
; GCN1-NEXT: s_waitcnt vmcnt(0)
-; GCN1-NEXT: v_xor_b32_e32 v3, v1, v3
-; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
-; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN1-NEXT: v_xor_b32_e32 v3, v5, v3
+; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT: .LBB72_6: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v0, v4
+; GCN1-NEXT: v_mov_b32_e32 v1, v5
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
@@ -9696,41 +11043,56 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) {
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN2-NEXT: v_mov_b32_e32 v5, v1
-; GCN2-NEXT: v_mov_b32_e32 v4, v0
-; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB72_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB72_4
-; GCN2-NEXT: .LBB72_2: ; %atomicrmw.phi
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-; GCN2-NEXT: .LBB72_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN2-NEXT: s_cbranch_execz .LBB72_4
+; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v5, v[4:5]
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB72_2: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: v_xor_b32_e32 v5, v7, v3
+; GCN2-NEXT: v_xor_b32_e32 v4, v6, v2
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB72_2
+; GCN2-NEXT: ; %bb.3: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: ; implicit-def: $vgpr2
+; GCN2-NEXT: .LBB72_4: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execz .LBB72_2
-; GCN2-NEXT: .LBB72_4: ; %atomicrmw.private
-; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
-; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN2-NEXT: s_cbranch_execz .LBB72_6
+; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
+; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
-; GCN2-NEXT: v_xor_b32_e32 v2, v0, v2
+; GCN2-NEXT: v_xor_b32_e32 v2, v4, v2
; GCN2-NEXT: s_waitcnt vmcnt(0)
-; GCN2-NEXT: v_xor_b32_e32 v3, v1, v3
-; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
-; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN2-NEXT: v_xor_b32_e32 v3, v5, v3
+; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT: .LBB72_6: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v0, v4
+; GCN2-NEXT: v_mov_b32_e32 v1, v5
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
@@ -9745,21 +11107,37 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB72_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB72_4
+; GCN3-NEXT: s_cbranch_execnz .LBB72_6
; GCN3-NEXT: .LBB72_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB72_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB72_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_xor_b32_e32 v7, v9, v3
+; GCN3-NEXT: v_xor_b32_e32 v6, v8, v2
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB72_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: ; implicit-def: $vgpr2
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB72_2
-; GCN3-NEXT: .LBB72_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB72_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
@@ -9791,21 +11169,40 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB73_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: ; %bb.1: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB73_4
+; GCN1-NEXT: s_cbranch_execnz .LBB73_6
; GCN1-NEXT: .LBB73_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB73_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB73_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_xor_b32_e32 v7, v9, v3
+; GCN1-NEXT: v_xor_b32_e32 v6, v8, v2
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB73_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN1-NEXT: ; implicit-def: $vgpr3
+; GCN1-NEXT: ; implicit-def: $vgpr2
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB73_2
-; GCN1-NEXT: .LBB73_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB73_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
@@ -9834,21 +11231,40 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB73_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: ; %bb.1: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB73_4
+; GCN2-NEXT: s_cbranch_execnz .LBB73_6
; GCN2-NEXT: .LBB73_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB73_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB73_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_xor_b32_e32 v7, v9, v3
+; GCN2-NEXT: v_xor_b32_e32 v6, v8, v2
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB73_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN2-NEXT: ; implicit-def: $vgpr3
+; GCN2-NEXT: ; implicit-def: $vgpr2
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB73_2
-; GCN2-NEXT: .LBB73_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB73_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
@@ -9875,21 +11291,37 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB73_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB73_4
+; GCN3-NEXT: s_cbranch_execnz .LBB73_6
; GCN3-NEXT: .LBB73_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB73_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB73_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_xor_b32_e32 v7, v9, v3
+; GCN3-NEXT: v_xor_b32_e32 v6, v8, v2
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB73_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr3
+; GCN3-NEXT: ; implicit-def: $vgpr2
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB73_2
-; GCN3-NEXT: .LBB73_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB73_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4
@@ -9920,21 +11352,39 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN1-NEXT: s_mov_b64 s[34:35], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB74_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_vccz .LBB74_4
+; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_vccnz .LBB74_6
; GCN1-NEXT: .LBB74_2: ; %atomicrmw.phi
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB74_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_add_u32 s34, s4, 4
+; GCN1-NEXT: s_addc_u32 s35, s5, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v4, s4
+; GCN1-NEXT: v_mov_b32_e32 v5, s5
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: flat_load_dword v2, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB74_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_xor_b32_e32 v1, s7, v3
+; GCN1-NEXT: v_xor_b32_e32 v0, s6, v2
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execnz .LBB74_2
-; GCN1-NEXT: .LBB74_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_mov_b32_e32 v3, v1
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v2, v0
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB74_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_branch .LBB74_2
+; GCN1-NEXT: .LBB74_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
; GCN1-NEXT: s_cselect_b32 s34, s4, -1
@@ -9963,21 +11413,39 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN2-NEXT: s_mov_b64 s[34:35], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB74_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_vccz .LBB74_4
+; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_vccnz .LBB74_6
; GCN2-NEXT: .LBB74_2: ; %atomicrmw.phi
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB74_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_add_u32 s34, s4, 4
+; GCN2-NEXT: s_addc_u32 s35, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v4, s4
+; GCN2-NEXT: v_mov_b32_e32 v5, s5
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: flat_load_dword v2, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB74_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_xor_b32_e32 v1, s7, v3
+; GCN2-NEXT: v_xor_b32_e32 v0, s6, v2
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execnz .LBB74_2
-; GCN2-NEXT: .LBB74_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_mov_b32_e32 v3, v1
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v2, v0
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB74_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_branch .LBB74_2
+; GCN2-NEXT: .LBB74_6: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s34, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s34
@@ -10003,21 +11471,34 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN3-NEXT: s_mov_b64 s[34:35], -1
; GCN3-NEXT: s_cbranch_vccnz .LBB74_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
-; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_vccz .LBB74_4
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_vccnz .LBB74_6
; GCN3-NEXT: .LBB74_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB74_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v4, s4
+; GCN3-NEXT: v_mov_b32_e32 v5, s5
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB74_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_xor_b32_e32 v1, s7, v3
+; GCN3-NEXT: v_xor_b32_e32 v0, s6, v2
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execnz .LBB74_2
-; GCN3-NEXT: .LBB74_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v3, v1
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v2, v0
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB74_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_branch .LBB74_2
+; GCN3-NEXT: .LBB74_6: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN3-NEXT: s_cselect_b32 s34, s4, -1
; GCN3-NEXT: v_mov_b32_e32 v0, s34
@@ -10049,21 +11530,39 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN1-NEXT: s_mov_b64 s[36:37], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB75_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT: s_cbranch_vccz .LBB75_4
+; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_vccnz .LBB75_6
; GCN1-NEXT: .LBB75_2: ; %atomicrmw.phi
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB75_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_add_u32 s36, s34, 4
+; GCN1-NEXT: s_addc_u32 s37, s35, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s36
+; GCN1-NEXT: v_mov_b32_e32 v1, s37
+; GCN1-NEXT: v_mov_b32_e32 v4, s34
+; GCN1-NEXT: v_mov_b32_e32 v5, s35
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: flat_load_dword v2, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: .LBB75_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_xor_b32_e32 v1, s7, v3
+; GCN1-NEXT: v_xor_b32_e32 v0, s6, v2
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execnz .LBB75_2
-; GCN1-NEXT: .LBB75_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_mov_b32_e32 v3, v1
+; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN1-NEXT: v_mov_b32_e32 v2, v0
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_execnz .LBB75_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_branch .LBB75_2
+; GCN1-NEXT: .LBB75_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
; GCN1-NEXT: s_cselect_b32 s34, s34, -1
@@ -10094,21 +11593,39 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN2-NEXT: s_mov_b64 s[36:37], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB75_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT: s_cbranch_vccz .LBB75_4
+; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_vccnz .LBB75_6
; GCN2-NEXT: .LBB75_2: ; %atomicrmw.phi
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB75_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_add_u32 s36, s34, 4
+; GCN2-NEXT: s_addc_u32 s37, s35, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s36
+; GCN2-NEXT: v_mov_b32_e32 v1, s37
+; GCN2-NEXT: v_mov_b32_e32 v4, s34
+; GCN2-NEXT: v_mov_b32_e32 v5, s35
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: flat_load_dword v2, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: .LBB75_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_xor_b32_e32 v1, s7, v3
+; GCN2-NEXT: v_xor_b32_e32 v0, s6, v2
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execnz .LBB75_2
-; GCN2-NEXT: .LBB75_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_mov_b32_e32 v3, v1
+; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN2-NEXT: v_mov_b32_e32 v2, v0
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_execnz .LBB75_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_branch .LBB75_2
+; GCN2-NEXT: .LBB75_6: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN2-NEXT: s_cselect_b32 s34, s34, -1
; GCN2-NEXT: v_mov_b32_e32 v0, s34
@@ -10136,21 +11653,34 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN3-NEXT: s_mov_b64 s[36:37], -1
; GCN3-NEXT: s_cbranch_vccnz .LBB75_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
-; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT: s_cbranch_vccz .LBB75_4
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_vccnz .LBB75_6
; GCN3-NEXT: .LBB75_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB75_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v4, s34
+; GCN3-NEXT: v_mov_b32_e32 v5, s35
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: .LBB75_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_xor_b32_e32 v1, s7, v3
+; GCN3-NEXT: v_xor_b32_e32 v0, s6, v2
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execnz .LBB75_2
-; GCN3-NEXT: .LBB75_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v3, v1
+; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN3-NEXT: v_mov_b32_e32 v2, v0
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_execnz .LBB75_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_branch .LBB75_2
+; GCN3-NEXT: .LBB75_6: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN3-NEXT: s_cselect_b32 s34, s34, -1
; GCN3-NEXT: v_mov_b32_e32 v0, s34
@@ -10179,20 +11709,38 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN1-NEXT: s_cmp_eq_u32 s5, s34
; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_vccz .LBB76_2
+; GCN1-NEXT: s_cbranch_vccz .LBB76_4
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_add_u32 s34, s4, 4
+; GCN1-NEXT: s_addc_u32 s35, s5, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
+; GCN1-NEXT: v_mov_b32_e32 v3, s5
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[2:3]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB76_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v7, v1
+; GCN1-NEXT: v_mov_b32_e32 v6, v0
+; GCN1-NEXT: v_xor_b32_e32 v5, s7, v7
+; GCN1-NEXT: v_xor_b32_e32 v4, s6, v6
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execz .LBB76_3
-; GCN1-NEXT: s_branch .LBB76_4
-; GCN1-NEXT: .LBB76_2:
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB76_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_branch .LBB76_6
+; GCN1-NEXT: .LBB76_4:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT: .LBB76_3: ; %atomicrmw.private
+; GCN1-NEXT: s_cbranch_execz .LBB76_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
; GCN1-NEXT: s_cselect_b32 s34, s4, -1
@@ -10207,7 +11755,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN1-NEXT: v_xor_b32_e32 v5, s7, v1
; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN1-NEXT: .LBB76_4: ; %atomicrmw.end
+; GCN1-NEXT: .LBB76_6: ; %atomicrmw.phi
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
@@ -10220,20 +11768,38 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN2-NEXT: s_cmp_eq_u32 s5, s34
; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_vccz .LBB76_2
+; GCN2-NEXT: s_cbranch_vccz .LBB76_4
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_add_u32 s34, s4, 4
+; GCN2-NEXT: s_addc_u32 s35, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
+; GCN2-NEXT: v_mov_b32_e32 v3, s5
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[2:3]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB76_2: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v7, v1
+; GCN2-NEXT: v_mov_b32_e32 v6, v0
+; GCN2-NEXT: v_xor_b32_e32 v5, s7, v7
+; GCN2-NEXT: v_xor_b32_e32 v4, s6, v6
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execz .LBB76_3
-; GCN2-NEXT: s_branch .LBB76_4
-; GCN2-NEXT: .LBB76_2:
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB76_2
+; GCN2-NEXT: ; %bb.3: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_branch .LBB76_6
+; GCN2-NEXT: .LBB76_4:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT: .LBB76_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cbranch_execz .LBB76_6
+; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s34, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s34
@@ -10247,7 +11813,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN2-NEXT: v_xor_b32_e32 v5, s7, v1
; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN2-NEXT: .LBB76_4: ; %atomicrmw.end
+; GCN2-NEXT: .LBB76_6: ; %atomicrmw.phi
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
@@ -10258,20 +11824,33 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN3-NEXT: s_cmp_eq_u32 s5, s35
; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_vccz .LBB76_2
+; GCN3-NEXT: s_cbranch_vccz .LBB76_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v2, s4
+; GCN3-NEXT: v_mov_b32_e32 v3, s5
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB76_2: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v7, v1
+; GCN3-NEXT: v_mov_b32_e32 v6, v0
+; GCN3-NEXT: v_xor_b32_e32 v5, s7, v7
+; GCN3-NEXT: v_xor_b32_e32 v4, s6, v6
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execz .LBB76_3
-; GCN3-NEXT: s_branch .LBB76_4
-; GCN3-NEXT: .LBB76_2:
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB76_2
+; GCN3-NEXT: ; %bb.3: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_branch .LBB76_6
+; GCN3-NEXT: .LBB76_4:
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT: .LBB76_3: ; %atomicrmw.private
+; GCN3-NEXT: s_cbranch_execz .LBB76_6
+; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN3-NEXT: s_cselect_b32 s34, s4, -1
; GCN3-NEXT: v_mov_b32_e32 v2, s34
@@ -10283,7 +11862,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GCN3-NEXT: v_xor_b32_e32 v4, s6, v0
; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT: .LBB76_4: ; %atomicrmw.end
+; GCN3-NEXT: .LBB76_6: ; %atomicrmw.phi
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw xor ptr %ptr, i64 %in seq_cst
@@ -10302,20 +11881,38 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN1-NEXT: s_cmp_eq_u32 s35, s36
; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT: s_cbranch_vccz .LBB77_2
+; GCN1-NEXT: s_cbranch_vccz .LBB77_4
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_add_u32 s36, s34, 4
+; GCN1-NEXT: s_addc_u32 s37, s35, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s36
+; GCN1-NEXT: v_mov_b32_e32 v1, s37
+; GCN1-NEXT: v_mov_b32_e32 v2, s34
+; GCN1-NEXT: v_mov_b32_e32 v3, s35
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[2:3]
+; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: .LBB77_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v7, v1
+; GCN1-NEXT: v_mov_b32_e32 v6, v0
+; GCN1-NEXT: v_xor_b32_e32 v5, s7, v7
+; GCN1-NEXT: v_xor_b32_e32 v4, s6, v6
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execz .LBB77_3
-; GCN1-NEXT: s_branch .LBB77_4
-; GCN1-NEXT: .LBB77_2:
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_execnz .LBB77_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_branch .LBB77_6
+; GCN1-NEXT: .LBB77_4:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT: .LBB77_3: ; %atomicrmw.private
+; GCN1-NEXT: s_cbranch_execz .LBB77_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
; GCN1-NEXT: s_cselect_b32 s34, s34, -1
@@ -10330,7 +11927,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN1-NEXT: v_xor_b32_e32 v5, s7, v1
; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN1-NEXT: .LBB77_4: ; %atomicrmw.end
+; GCN1-NEXT: .LBB77_6: ; %atomicrmw.phi
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
@@ -10345,20 +11942,38 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN2-NEXT: s_cmp_eq_u32 s35, s36
; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT: s_cbranch_vccz .LBB77_2
+; GCN2-NEXT: s_cbranch_vccz .LBB77_4
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_add_u32 s36, s34, 4
+; GCN2-NEXT: s_addc_u32 s37, s35, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s36
+; GCN2-NEXT: v_mov_b32_e32 v1, s37
+; GCN2-NEXT: v_mov_b32_e32 v2, s34
+; GCN2-NEXT: v_mov_b32_e32 v3, s35
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[2:3]
+; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: .LBB77_2: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v7, v1
+; GCN2-NEXT: v_mov_b32_e32 v6, v0
+; GCN2-NEXT: v_xor_b32_e32 v5, s7, v7
+; GCN2-NEXT: v_xor_b32_e32 v4, s6, v6
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execz .LBB77_3
-; GCN2-NEXT: s_branch .LBB77_4
-; GCN2-NEXT: .LBB77_2:
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_execnz .LBB77_2
+; GCN2-NEXT: ; %bb.3: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_branch .LBB77_6
+; GCN2-NEXT: .LBB77_4:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT: .LBB77_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cbranch_execz .LBB77_6
+; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN2-NEXT: s_cselect_b32 s34, s34, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s34
@@ -10372,7 +11987,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN2-NEXT: v_xor_b32_e32 v5, s7, v1
; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN2-NEXT: .LBB77_4: ; %atomicrmw.end
+; GCN2-NEXT: .LBB77_6: ; %atomicrmw.phi
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
@@ -10385,20 +12000,33 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN3-NEXT: s_cmp_eq_u32 s35, s37
; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT: s_cbranch_vccz .LBB77_2
+; GCN3-NEXT: s_cbranch_vccz .LBB77_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v2, s34
+; GCN3-NEXT: v_mov_b32_e32 v3, s35
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: .LBB77_2: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v7, v1
+; GCN3-NEXT: v_mov_b32_e32 v6, v0
+; GCN3-NEXT: v_xor_b32_e32 v5, s7, v7
+; GCN3-NEXT: v_xor_b32_e32 v4, s6, v6
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execz .LBB77_3
-; GCN3-NEXT: s_branch .LBB77_4
-; GCN3-NEXT: .LBB77_2:
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_execnz .LBB77_2
+; GCN3-NEXT: ; %bb.3: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_branch .LBB77_6
+; GCN3-NEXT: .LBB77_4:
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT: .LBB77_3: ; %atomicrmw.private
+; GCN3-NEXT: s_cbranch_execz .LBB77_6
+; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN3-NEXT: s_cselect_b32 s34, s34, -1
; GCN3-NEXT: v_mov_b32_e32 v2, s34
@@ -10410,7 +12038,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
; GCN3-NEXT: v_xor_b32_e32 v4, s6, v0
; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT: .LBB77_4: ; %atomicrmw.end
+; GCN3-NEXT: .LBB77_6: ; %atomicrmw.phi
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
@@ -13030,40 +14658,21 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB92_3
-; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB92_6
+; GCN1-NEXT: s_cbranch_execnz .LBB92_4
; GCN1-NEXT: .LBB92_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB92_3: ; %atomicrmw.global
-; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
-; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_load_dword v7, v[4:5]
-; GCN1-NEXT: flat_load_dword v6, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[6:7], 0
-; GCN1-NEXT: .LBB92_4: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
-; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GCN1-NEXT: v_mov_b32_e32 v7, v5
-; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN1-NEXT: v_mov_b32_e32 v6, v4
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN1-NEXT: s_cbranch_execnz .LBB92_4
-; GCN1-NEXT: ; %bb.5: ; %Flow
-; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB92_2
-; GCN1-NEXT: .LBB92_6: ; %atomicrmw.private
+; GCN1-NEXT: .LBB92_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
@@ -13091,40 +14700,21 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB92_3
-; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB92_6
+; GCN2-NEXT: s_cbranch_execnz .LBB92_4
; GCN2-NEXT: .LBB92_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB92_3: ; %atomicrmw.global
-; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_load_dword v7, v[4:5]
-; GCN2-NEXT: flat_load_dword v6, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[6:7], 0
-; GCN2-NEXT: .LBB92_4: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
-; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GCN2-NEXT: v_mov_b32_e32 v7, v5
-; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN2-NEXT: v_mov_b32_e32 v6, v4
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN2-NEXT: s_cbranch_execnz .LBB92_4
-; GCN2-NEXT: ; %bb.5: ; %Flow
-; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB92_2
-; GCN2-NEXT: .LBB92_6: ; %atomicrmw.private
+; GCN2-NEXT: .LBB92_4: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
@@ -13150,37 +14740,21 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB92_3
-; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: ; %bb.1: ; %Flow
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB92_6
+; GCN3-NEXT: s_cbranch_execnz .LBB92_4
; GCN3-NEXT: .LBB92_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB92_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
-; GCN3-NEXT: s_mov_b64 s[6:7], 0
-; GCN3-NEXT: .LBB92_4: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
-; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN3-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GCN3-NEXT: v_mov_b32_e32 v7, v5
-; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN3-NEXT: v_mov_b32_e32 v6, v4
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN3-NEXT: s_cbranch_execnz .LBB92_4
-; GCN3-NEXT: ; %bb.5: ; %Flow
-; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB92_2
-; GCN3-NEXT: .LBB92_6: ; %atomicrmw.private
+; GCN3-NEXT: .LBB92_4: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -13213,40 +14787,21 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB93_3
-; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB93_6
+; GCN1-NEXT: s_cbranch_execnz .LBB93_4
; GCN1-NEXT: .LBB93_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB93_3: ; %atomicrmw.global
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
-; GCN1-NEXT: flat_load_dword v0, v[4:5]
-; GCN1-NEXT: s_mov_b64 s[6:7], 0
-; GCN1-NEXT: .LBB93_4: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v9, v1
-; GCN1-NEXT: v_mov_b32_e32 v8, v0
-; GCN1-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
-; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN1-NEXT: s_cbranch_execnz .LBB93_4
-; GCN1-NEXT: ; %bb.5: ; %Flow
-; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB93_2
-; GCN1-NEXT: .LBB93_6: ; %atomicrmw.private
+; GCN1-NEXT: .LBB93_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
@@ -13275,40 +14830,21 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB93_3
-; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB93_6
+; GCN2-NEXT: s_cbranch_execnz .LBB93_4
; GCN2-NEXT: .LBB93_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB93_3: ; %atomicrmw.global
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
-; GCN2-NEXT: flat_load_dword v0, v[4:5]
-; GCN2-NEXT: s_mov_b64 s[6:7], 0
-; GCN2-NEXT: .LBB93_4: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v9, v1
-; GCN2-NEXT: v_mov_b32_e32 v8, v0
-; GCN2-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
-; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN2-NEXT: s_cbranch_execnz .LBB93_4
-; GCN2-NEXT: ; %bb.5: ; %Flow
-; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB93_2
-; GCN2-NEXT: .LBB93_6: ; %atomicrmw.private
+; GCN2-NEXT: .LBB93_4: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
@@ -13335,37 +14871,21 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB93_3
-; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: ; %bb.1: ; %Flow
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB93_6
+; GCN3-NEXT: s_cbranch_execnz .LBB93_4
; GCN3-NEXT: .LBB93_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB93_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
-; GCN3-NEXT: s_mov_b64 s[6:7], 0
-; GCN3-NEXT: .LBB93_4: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v9, v1
-; GCN3-NEXT: v_mov_b32_e32 v8, v0
-; GCN3-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
-; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN3-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN3-NEXT: s_cbranch_execnz .LBB93_4
-; GCN3-NEXT: ; %bb.5: ; %Flow
-; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB93_2
-; GCN3-NEXT: .LBB93_6: ; %atomicrmw.private
+; GCN3-NEXT: .LBB93_4: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -15528,40 +17048,21 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB105_3
-; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB105_6
+; GCN1-NEXT: s_cbranch_execnz .LBB105_4
; GCN1-NEXT: .LBB105_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB105_3: ; %atomicrmw.global
-; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
-; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_load_dword v7, v[4:5]
-; GCN1-NEXT: flat_load_dword v6, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[6:7], 0
-; GCN1-NEXT: .LBB105_4: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
-; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GCN1-NEXT: v_mov_b32_e32 v7, v5
-; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN1-NEXT: v_mov_b32_e32 v6, v4
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN1-NEXT: s_cbranch_execnz .LBB105_4
-; GCN1-NEXT: ; %bb.5: ; %Flow
-; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB105_2
-; GCN1-NEXT: .LBB105_6: ; %atomicrmw.private
+; GCN1-NEXT: .LBB105_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
@@ -15589,40 +17090,21 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB105_3
-; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB105_6
+; GCN2-NEXT: s_cbranch_execnz .LBB105_4
; GCN2-NEXT: .LBB105_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB105_3: ; %atomicrmw.global
-; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_load_dword v7, v[4:5]
-; GCN2-NEXT: flat_load_dword v6, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[6:7], 0
-; GCN2-NEXT: .LBB105_4: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
-; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GCN2-NEXT: v_mov_b32_e32 v7, v5
-; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN2-NEXT: v_mov_b32_e32 v6, v4
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN2-NEXT: s_cbranch_execnz .LBB105_4
-; GCN2-NEXT: ; %bb.5: ; %Flow
-; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB105_2
-; GCN2-NEXT: .LBB105_6: ; %atomicrmw.private
+; GCN2-NEXT: .LBB105_4: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
@@ -15648,37 +17130,21 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB105_3
-; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: ; %bb.1: ; %Flow
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB105_6
+; GCN3-NEXT: s_cbranch_execnz .LBB105_4
; GCN3-NEXT: .LBB105_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB105_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
-; GCN3-NEXT: s_mov_b64 s[6:7], 0
-; GCN3-NEXT: .LBB105_4: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
-; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN3-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GCN3-NEXT: v_mov_b32_e32 v7, v5
-; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN3-NEXT: v_mov_b32_e32 v6, v4
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN3-NEXT: s_cbranch_execnz .LBB105_4
-; GCN3-NEXT: ; %bb.5: ; %Flow
-; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB105_2
-; GCN3-NEXT: .LBB105_6: ; %atomicrmw.private
+; GCN3-NEXT: .LBB105_4: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -15711,40 +17177,21 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB106_3
-; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB106_6
+; GCN1-NEXT: s_cbranch_execnz .LBB106_4
; GCN1-NEXT: .LBB106_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB106_3: ; %atomicrmw.global
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
-; GCN1-NEXT: flat_load_dword v0, v[4:5]
-; GCN1-NEXT: s_mov_b64 s[6:7], 0
-; GCN1-NEXT: .LBB106_4: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v9, v1
-; GCN1-NEXT: v_mov_b32_e32 v8, v0
-; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
-; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN1-NEXT: s_cbranch_execnz .LBB106_4
-; GCN1-NEXT: ; %bb.5: ; %Flow
-; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB106_2
-; GCN1-NEXT: .LBB106_6: ; %atomicrmw.private
+; GCN1-NEXT: .LBB106_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
@@ -15773,40 +17220,21 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB106_3
-; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB106_6
+; GCN2-NEXT: s_cbranch_execnz .LBB106_4
; GCN2-NEXT: .LBB106_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB106_3: ; %atomicrmw.global
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
-; GCN2-NEXT: flat_load_dword v0, v[4:5]
-; GCN2-NEXT: s_mov_b64 s[6:7], 0
-; GCN2-NEXT: .LBB106_4: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v9, v1
-; GCN2-NEXT: v_mov_b32_e32 v8, v0
-; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
-; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN2-NEXT: s_cbranch_execnz .LBB106_4
-; GCN2-NEXT: ; %bb.5: ; %Flow
-; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB106_2
-; GCN2-NEXT: .LBB106_6: ; %atomicrmw.private
+; GCN2-NEXT: .LBB106_4: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
@@ -15833,37 +17261,21 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB106_3
-; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: ; %bb.1: ; %Flow
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB106_6
+; GCN3-NEXT: s_cbranch_execnz .LBB106_4
; GCN3-NEXT: .LBB106_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB106_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
-; GCN3-NEXT: s_mov_b64 s[6:7], 0
-; GCN3-NEXT: .LBB106_4: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v9, v1
-; GCN3-NEXT: v_mov_b32_e32 v8, v0
-; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
-; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN3-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN3-NEXT: s_cbranch_execnz .LBB106_4
-; GCN3-NEXT: ; %bb.5: ; %Flow
-; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB106_2
-; GCN3-NEXT: .LBB106_6: ; %atomicrmw.private
+; GCN3-NEXT: .LBB106_4: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -17383,40 +18795,21 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB115_3
-; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB115_6
+; GCN1-NEXT: s_cbranch_execnz .LBB115_4
; GCN1-NEXT: .LBB115_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB115_3: ; %atomicrmw.global
-; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
-; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_load_dword v7, v[4:5]
-; GCN1-NEXT: flat_load_dword v6, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[6:7], 0
-; GCN1-NEXT: .LBB115_4: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
-; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GCN1-NEXT: v_mov_b32_e32 v7, v5
-; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN1-NEXT: v_mov_b32_e32 v6, v4
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN1-NEXT: s_cbranch_execnz .LBB115_4
-; GCN1-NEXT: ; %bb.5: ; %Flow
-; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB115_2
-; GCN1-NEXT: .LBB115_6: ; %atomicrmw.private
+; GCN1-NEXT: .LBB115_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
@@ -17444,40 +18837,21 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB115_3
-; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB115_6
+; GCN2-NEXT: s_cbranch_execnz .LBB115_4
; GCN2-NEXT: .LBB115_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB115_3: ; %atomicrmw.global
-; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_load_dword v7, v[4:5]
-; GCN2-NEXT: flat_load_dword v6, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[6:7], 0
-; GCN2-NEXT: .LBB115_4: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
-; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GCN2-NEXT: v_mov_b32_e32 v7, v5
-; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN2-NEXT: v_mov_b32_e32 v6, v4
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN2-NEXT: s_cbranch_execnz .LBB115_4
-; GCN2-NEXT: ; %bb.5: ; %Flow
-; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB115_2
-; GCN2-NEXT: .LBB115_6: ; %atomicrmw.private
+; GCN2-NEXT: .LBB115_4: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
@@ -17503,37 +18877,21 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB115_3
-; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: ; %bb.1: ; %Flow
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB115_6
+; GCN3-NEXT: s_cbranch_execnz .LBB115_4
; GCN3-NEXT: .LBB115_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB115_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
-; GCN3-NEXT: s_mov_b64 s[6:7], 0
-; GCN3-NEXT: .LBB115_4: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
-; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN3-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GCN3-NEXT: v_mov_b32_e32 v7, v5
-; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN3-NEXT: v_mov_b32_e32 v6, v4
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN3-NEXT: s_cbranch_execnz .LBB115_4
-; GCN3-NEXT: ; %bb.5: ; %Flow
-; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB115_2
-; GCN3-NEXT: .LBB115_6: ; %atomicrmw.private
+; GCN3-NEXT: .LBB115_4: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -17566,40 +18924,21 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB116_3
-; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB116_6
+; GCN1-NEXT: s_cbranch_execnz .LBB116_4
; GCN1-NEXT: .LBB116_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB116_3: ; %atomicrmw.global
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
-; GCN1-NEXT: flat_load_dword v0, v[4:5]
-; GCN1-NEXT: s_mov_b64 s[6:7], 0
-; GCN1-NEXT: .LBB116_4: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v9, v1
-; GCN1-NEXT: v_mov_b32_e32 v8, v0
-; GCN1-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
-; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN1-NEXT: s_cbranch_execnz .LBB116_4
-; GCN1-NEXT: ; %bb.5: ; %Flow
-; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB116_2
-; GCN1-NEXT: .LBB116_6: ; %atomicrmw.private
+; GCN1-NEXT: .LBB116_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
@@ -17628,40 +18967,21 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB116_3
-; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB116_6
+; GCN2-NEXT: s_cbranch_execnz .LBB116_4
; GCN2-NEXT: .LBB116_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB116_3: ; %atomicrmw.global
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
-; GCN2-NEXT: flat_load_dword v0, v[4:5]
-; GCN2-NEXT: s_mov_b64 s[6:7], 0
-; GCN2-NEXT: .LBB116_4: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v9, v1
-; GCN2-NEXT: v_mov_b32_e32 v8, v0
-; GCN2-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
-; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN2-NEXT: s_cbranch_execnz .LBB116_4
-; GCN2-NEXT: ; %bb.5: ; %Flow
-; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB116_2
-; GCN2-NEXT: .LBB116_6: ; %atomicrmw.private
+; GCN2-NEXT: .LBB116_4: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
@@ -17688,37 +19008,21 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB116_3
-; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: ; %bb.1: ; %Flow
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB116_6
+; GCN3-NEXT: s_cbranch_execnz .LBB116_4
; GCN3-NEXT: .LBB116_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB116_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
-; GCN3-NEXT: s_mov_b64 s[6:7], 0
-; GCN3-NEXT: .LBB116_4: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v9, v1
-; GCN3-NEXT: v_mov_b32_e32 v8, v0
-; GCN3-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
-; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN3-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN3-NEXT: s_cbranch_execnz .LBB116_4
-; GCN3-NEXT: ; %bb.5: ; %Flow
-; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB116_2
-; GCN3-NEXT: .LBB116_6: ; %atomicrmw.private
+; GCN3-NEXT: .LBB116_4: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20077,40 +21381,21 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB129_3
-; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB129_6
+; GCN1-NEXT: s_cbranch_execnz .LBB129_4
; GCN1-NEXT: .LBB129_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB129_3: ; %atomicrmw.global
-; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
-; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN1-NEXT: flat_load_dword v7, v[4:5]
-; GCN1-NEXT: flat_load_dword v6, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[6:7], 0
-; GCN1-NEXT: .LBB129_4: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
-; GCN1-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GCN1-NEXT: v_mov_b32_e32 v7, v5
-; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN1-NEXT: v_mov_b32_e32 v6, v4
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN1-NEXT: s_cbranch_execnz .LBB129_4
-; GCN1-NEXT: ; %bb.5: ; %Flow
-; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB129_2
-; GCN1-NEXT: .LBB129_6: ; %atomicrmw.private
+; GCN1-NEXT: .LBB129_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
@@ -20138,40 +21423,21 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB129_3
-; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB129_6
+; GCN2-NEXT: s_cbranch_execnz .LBB129_4
; GCN2-NEXT: .LBB129_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB129_3: ; %atomicrmw.global
-; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN2-NEXT: flat_load_dword v7, v[4:5]
-; GCN2-NEXT: flat_load_dword v6, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[6:7], 0
-; GCN2-NEXT: .LBB129_4: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
-; GCN2-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GCN2-NEXT: v_mov_b32_e32 v7, v5
-; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN2-NEXT: v_mov_b32_e32 v6, v4
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN2-NEXT: s_cbranch_execnz .LBB129_4
-; GCN2-NEXT: ; %bb.5: ; %Flow
-; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB129_2
-; GCN2-NEXT: .LBB129_6: ; %atomicrmw.private
+; GCN2-NEXT: .LBB129_4: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
@@ -20197,37 +21463,21 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB129_3
-; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: ; %bb.1: ; %Flow
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB129_6
+; GCN3-NEXT: s_cbranch_execnz .LBB129_4
; GCN3-NEXT: .LBB129_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB129_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
-; GCN3-NEXT: s_mov_b64 s[6:7], 0
-; GCN3-NEXT: .LBB129_4: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
-; GCN3-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GCN3-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GCN3-NEXT: v_mov_b32_e32 v7, v5
-; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN3-NEXT: v_mov_b32_e32 v6, v4
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN3-NEXT: s_cbranch_execnz .LBB129_4
-; GCN3-NEXT: ; %bb.5: ; %Flow
-; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB129_2
-; GCN3-NEXT: .LBB129_6: ; %atomicrmw.private
+; GCN3-NEXT: .LBB129_4: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20260,40 +21510,21 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB130_3
-; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: ; %bb.1: ; %Flow
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB130_6
+; GCN1-NEXT: s_cbranch_execnz .LBB130_4
; GCN1-NEXT: .LBB130_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB130_3: ; %atomicrmw.global
-; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
-; GCN1-NEXT: flat_load_dword v0, v[4:5]
-; GCN1-NEXT: s_mov_b64 s[6:7], 0
-; GCN1-NEXT: .LBB130_4: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v9, v1
-; GCN1-NEXT: v_mov_b32_e32 v8, v0
-; GCN1-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
-; GCN1-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN1-NEXT: s_cbranch_execnz .LBB130_4
-; GCN1-NEXT: ; %bb.5: ; %Flow
-; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB130_2
-; GCN1-NEXT: .LBB130_6: ; %atomicrmw.private
+; GCN1-NEXT: .LBB130_4: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
@@ -20322,40 +21553,21 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB130_3
-; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: ; %bb.1: ; %Flow
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB130_6
+; GCN2-NEXT: s_cbranch_execnz .LBB130_4
; GCN2-NEXT: .LBB130_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB130_3: ; %atomicrmw.global
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
-; GCN2-NEXT: flat_load_dword v0, v[4:5]
-; GCN2-NEXT: s_mov_b64 s[6:7], 0
-; GCN2-NEXT: .LBB130_4: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v9, v1
-; GCN2-NEXT: v_mov_b32_e32 v8, v0
-; GCN2-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
-; GCN2-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN2-NEXT: s_cbranch_execnz .LBB130_4
-; GCN2-NEXT: ; %bb.5: ; %Flow
-; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB130_2
-; GCN2-NEXT: .LBB130_6: ; %atomicrmw.private
+; GCN2-NEXT: .LBB130_4: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
@@ -20382,37 +21594,21 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB130_3
-; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: ; %bb.1: ; %Flow
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB130_6
+; GCN3-NEXT: s_cbranch_execnz .LBB130_4
; GCN3-NEXT: .LBB130_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB130_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
-; GCN3-NEXT: s_mov_b64 s[6:7], 0
-; GCN3-NEXT: .LBB130_4: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v9, v1
-; GCN3-NEXT: v_mov_b32_e32 v8, v0
-; GCN3-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
-; GCN3-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GCN3-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GCN3-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN3-NEXT: s_cbranch_execnz .LBB130_4
-; GCN3-NEXT: ; %bb.5: ; %Flow
-; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB130_2
-; GCN3-NEXT: .LBB130_6: ; %atomicrmw.private
+; GCN3-NEXT: .LBB130_4: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20446,21 +21642,42 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) {
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB131_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: ; %bb.1: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB131_4
+; GCN1-NEXT: s_cbranch_execnz .LBB131_6
; GCN1-NEXT: .LBB131_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB131_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v7, v[4:5]
+; GCN1-NEXT: flat_load_dword v6, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB131_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v6
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB131_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB131_2
-; GCN1-NEXT: .LBB131_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB131_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20489,21 +21706,42 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) {
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB131_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: ; %bb.1: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB131_4
+; GCN2-NEXT: s_cbranch_execnz .LBB131_6
; GCN2-NEXT: .LBB131_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB131_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v7, v[4:5]
+; GCN2-NEXT: flat_load_dword v6, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB131_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v6
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB131_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB131_2
-; GCN2-NEXT: .LBB131_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB131_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20530,21 +21768,39 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB131_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB131_4
+; GCN3-NEXT: s_cbranch_execnz .LBB131_6
; GCN3-NEXT: .LBB131_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB131_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB131_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6
+; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB131_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB131_2
-; GCN3-NEXT: .LBB131_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB131_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20578,21 +21834,42 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB132_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: ; %bb.1: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB132_4
+; GCN1-NEXT: s_cbranch_execnz .LBB132_6
; GCN1-NEXT: .LBB132_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB132_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v7, v[4:5]
+; GCN1-NEXT: flat_load_dword v6, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB132_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v6
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB132_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB132_2
-; GCN1-NEXT: .LBB132_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB132_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20623,21 +21900,42 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB132_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: ; %bb.1: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB132_4
+; GCN2-NEXT: s_cbranch_execnz .LBB132_6
; GCN2-NEXT: .LBB132_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB132_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v7, v[4:5]
+; GCN2-NEXT: flat_load_dword v6, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB132_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v6
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB132_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB132_2
-; GCN2-NEXT: .LBB132_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB132_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20666,21 +21964,39 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB132_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB132_4
+; GCN3-NEXT: s_cbranch_execnz .LBB132_6
; GCN3-NEXT: .LBB132_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB132_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB132_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6
+; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB132_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB132_2
-; GCN3-NEXT: .LBB132_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB132_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20708,44 +22024,61 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) {
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN1-NEXT: v_mov_b32_e32 v5, v1
-; GCN1-NEXT: v_mov_b32_e32 v4, v0
-; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB133_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB133_4
-; GCN1-NEXT: .LBB133_2: ; %atomicrmw.phi
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-; GCN1-NEXT: .LBB133_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN1-NEXT: s_cbranch_execz .LBB133_4
+; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v5, v[4:5]
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB133_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v6
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB133_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GCN1-NEXT: .LBB133_4: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execz .LBB133_2
-; GCN1-NEXT: .LBB133_4: ; %atomicrmw.private
-; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
-; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN1-NEXT: s_cbranch_execz .LBB133_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
-; GCN1-NEXT: v_add_i32_e32 v6, vcc, 1, v0
+; GCN1-NEXT: v_add_i32_e32 v6, vcc, 1, v4
; GCN1-NEXT: s_waitcnt vmcnt(0)
-; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
+; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[2:3]
; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc
-; GCN1-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; GCN1-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GCN1-NEXT: .LBB133_6: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN1-NEXT: v_mov_b32_e32 v0, v4
+; GCN1-NEXT: v_mov_b32_e32 v1, v5
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
@@ -20754,44 +22087,61 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) {
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN2-NEXT: v_mov_b32_e32 v5, v1
-; GCN2-NEXT: v_mov_b32_e32 v4, v0
-; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB133_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB133_4
-; GCN2-NEXT: .LBB133_2: ; %atomicrmw.phi
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-; GCN2-NEXT: .LBB133_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN2-NEXT: s_cbranch_execz .LBB133_4
+; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v5, v[4:5]
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB133_2: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v6
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB133_2
+; GCN2-NEXT: ; %bb.3: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GCN2-NEXT: .LBB133_4: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execz .LBB133_2
-; GCN2-NEXT: .LBB133_4: ; %atomicrmw.private
-; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
-; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN2-NEXT: s_cbranch_execz .LBB133_6
+; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
+; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
-; GCN2-NEXT: v_add_u32_e32 v6, vcc, 1, v0
+; GCN2-NEXT: v_add_u32_e32 v6, vcc, 1, v4
; GCN2-NEXT: s_waitcnt vmcnt(0)
-; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc
-; GCN2-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
-; GCN2-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GCN2-NEXT: .LBB133_6: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN2-NEXT: v_mov_b32_e32 v0, v4
+; GCN2-NEXT: v_mov_b32_e32 v1, v5
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
@@ -20806,21 +22156,39 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB133_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB133_4
+; GCN3-NEXT: s_cbranch_execnz .LBB133_6
; GCN3-NEXT: .LBB133_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB133_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB133_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v8
+; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v9, vcc
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB133_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB133_2
-; GCN3-NEXT: .LBB133_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB133_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20855,21 +22223,42 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB134_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: ; %bb.1: ; %Flow3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB134_4
+; GCN1-NEXT: s_cbranch_execnz .LBB134_6
; GCN1-NEXT: .LBB134_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB134_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[6:7], 0
+; GCN1-NEXT: .LBB134_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v8
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
+; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN1-NEXT: s_cbranch_execnz .LBB134_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN1-NEXT: s_cbranch_execz .LBB134_2
-; GCN1-NEXT: .LBB134_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB134_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20901,21 +22290,42 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB134_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: ; %bb.1: ; %Flow3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB134_4
+; GCN2-NEXT: s_cbranch_execnz .LBB134_6
; GCN2-NEXT: .LBB134_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB134_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[6:7], 0
+; GCN2-NEXT: .LBB134_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v8
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN2-NEXT: s_cbranch_execnz .LBB134_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN2-NEXT: s_cbranch_execz .LBB134_2
-; GCN2-NEXT: .LBB134_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB134_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20945,21 +22355,39 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB134_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
+; GCN3-NEXT: ; %bb.1: ; %Flow3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB134_4
+; GCN3-NEXT: s_cbranch_execnz .LBB134_6
; GCN3-NEXT: .LBB134_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB134_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[6:7], 0
+; GCN3-NEXT: .LBB134_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v8
+; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v9, vcc
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN3-NEXT: s_cbranch_execnz .LBB134_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN3-NEXT: s_cbranch_execz .LBB134_2
-; GCN3-NEXT: .LBB134_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB134_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -20993,21 +22421,42 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN1-NEXT: s_mov_b64 s[34:35], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB135_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_vccz .LBB135_4
+; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_vccnz .LBB135_6
; GCN1-NEXT: .LBB135_2: ; %atomicrmw.phi
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB135_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_add_u32 s34, s4, 4
+; GCN1-NEXT: s_addc_u32 s35, s5, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v4, s4
+; GCN1-NEXT: v_mov_b32_e32 v5, s5
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: flat_load_dword v2, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB135_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v2
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execnz .LBB135_2
-; GCN1-NEXT: .LBB135_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_mov_b32_e32 v3, v1
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: v_mov_b32_e32 v2, v0
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB135_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_branch .LBB135_2
+; GCN1-NEXT: .LBB135_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
; GCN1-NEXT: s_cselect_b32 s34, s4, -1
@@ -21039,21 +22488,42 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN2-NEXT: s_mov_b64 s[34:35], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB135_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_vccz .LBB135_4
+; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_vccnz .LBB135_6
; GCN2-NEXT: .LBB135_2: ; %atomicrmw.phi
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB135_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_add_u32 s34, s4, 4
+; GCN2-NEXT: s_addc_u32 s35, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v4, s4
+; GCN2-NEXT: v_mov_b32_e32 v5, s5
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: flat_load_dword v2, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB135_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v2
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execnz .LBB135_2
-; GCN2-NEXT: .LBB135_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_mov_b32_e32 v3, v1
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: v_mov_b32_e32 v2, v0
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB135_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_branch .LBB135_2
+; GCN2-NEXT: .LBB135_6: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s34, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s34
@@ -21082,21 +22552,37 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN3-NEXT: s_mov_b64 s[34:35], -1
; GCN3-NEXT: s_cbranch_vccnz .LBB135_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
-; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_vccz .LBB135_4
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_vccnz .LBB135_6
; GCN3-NEXT: .LBB135_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB135_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v4, s4
+; GCN3-NEXT: v_mov_b32_e32 v5, s5
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB135_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2
+; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execnz .LBB135_2
-; GCN3-NEXT: .LBB135_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v3, v1
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: v_mov_b32_e32 v2, v0
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB135_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_branch .LBB135_2
+; GCN3-NEXT: .LBB135_6: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN3-NEXT: s_cselect_b32 s34, s4, -1
; GCN3-NEXT: v_mov_b32_e32 v2, s34
@@ -21131,21 +22617,42 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN1-NEXT: s_mov_b64 s[36:37], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB136_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT: s_cbranch_vccz .LBB136_4
+; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_vccnz .LBB136_6
; GCN1-NEXT: .LBB136_2: ; %atomicrmw.phi
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB136_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_add_u32 s36, s34, 4
+; GCN1-NEXT: s_addc_u32 s37, s35, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s36
+; GCN1-NEXT: v_mov_b32_e32 v1, s37
+; GCN1-NEXT: v_mov_b32_e32 v4, s34
+; GCN1-NEXT: v_mov_b32_e32 v5, s35
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: flat_load_dword v2, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: .LBB136_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v2
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execnz .LBB136_2
-; GCN1-NEXT: .LBB136_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_mov_b32_e32 v3, v1
+; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN1-NEXT: v_mov_b32_e32 v2, v0
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_execnz .LBB136_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_branch .LBB136_2
+; GCN1-NEXT: .LBB136_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
; GCN1-NEXT: s_cselect_b32 s34, s34, -1
@@ -21179,21 +22686,42 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN2-NEXT: s_mov_b64 s[36:37], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB136_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT: s_cbranch_vccz .LBB136_4
+; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_vccnz .LBB136_6
; GCN2-NEXT: .LBB136_2: ; %atomicrmw.phi
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB136_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_add_u32 s36, s34, 4
+; GCN2-NEXT: s_addc_u32 s37, s35, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s36
+; GCN2-NEXT: v_mov_b32_e32 v1, s37
+; GCN2-NEXT: v_mov_b32_e32 v4, s34
+; GCN2-NEXT: v_mov_b32_e32 v5, s35
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: flat_load_dword v2, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: .LBB136_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v2
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execnz .LBB136_2
-; GCN2-NEXT: .LBB136_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_mov_b32_e32 v3, v1
+; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN2-NEXT: v_mov_b32_e32 v2, v0
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_execnz .LBB136_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_branch .LBB136_2
+; GCN2-NEXT: .LBB136_6: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN2-NEXT: s_cselect_b32 s34, s34, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s34
@@ -21224,21 +22752,37 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
; GCN3-NEXT: s_mov_b64 s[36:37], -1
; GCN3-NEXT: s_cbranch_vccnz .LBB136_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
-; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT: s_cbranch_vccz .LBB136_4
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_vccnz .LBB136_6
; GCN3-NEXT: .LBB136_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB136_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v4, s34
+; GCN3-NEXT: v_mov_b32_e32 v5, s35
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: .LBB136_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2
+; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execnz .LBB136_2
-; GCN3-NEXT: .LBB136_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v3, v1
+; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN3-NEXT: v_mov_b32_e32 v2, v0
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_execnz .LBB136_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_branch .LBB136_2
+; GCN3-NEXT: .LBB136_6: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN3-NEXT: s_cselect_b32 s34, s34, -1
; GCN3-NEXT: v_mov_b32_e32 v2, s34
@@ -21270,20 +22814,41 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN1-NEXT: s_cmp_eq_u32 s5, s34
; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_vccz .LBB137_2
+; GCN1-NEXT: s_cbranch_vccz .LBB137_4
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_add_u32 s34, s4, 4
+; GCN1-NEXT: s_addc_u32 s35, s5, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
+; GCN1-NEXT: v_mov_b32_e32 v3, s5
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[2:3]
+; GCN1-NEXT: s_mov_b64 s[34:35], 0
+; GCN1-NEXT: .LBB137_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v7, v1
+; GCN1-NEXT: v_mov_b32_e32 v6, v0
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v6
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
+; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execz .LBB137_3
-; GCN1-NEXT: s_branch .LBB137_4
-; GCN1-NEXT: .LBB137_2:
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_execnz .LBB137_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN1-NEXT: s_branch .LBB137_6
+; GCN1-NEXT: .LBB137_4:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT: .LBB137_3: ; %atomicrmw.private
+; GCN1-NEXT: s_cbranch_execz .LBB137_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
; GCN1-NEXT: s_cselect_b32 s34, s4, -1
@@ -21301,7 +22866,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN1-NEXT: .LBB137_4: ; %atomicrmw.end
+; GCN1-NEXT: .LBB137_6: ; %atomicrmw.phi
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
@@ -21314,20 +22879,41 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN2-NEXT: s_cmp_eq_u32 s5, s34
; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_vccz .LBB137_2
+; GCN2-NEXT: s_cbranch_vccz .LBB137_4
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_add_u32 s34, s4, 4
+; GCN2-NEXT: s_addc_u32 s35, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
+; GCN2-NEXT: v_mov_b32_e32 v3, s5
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[2:3]
+; GCN2-NEXT: s_mov_b64 s[34:35], 0
+; GCN2-NEXT: .LBB137_2: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v7, v1
+; GCN2-NEXT: v_mov_b32_e32 v6, v0
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v6
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
+; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execz .LBB137_3
-; GCN2-NEXT: s_branch .LBB137_4
-; GCN2-NEXT: .LBB137_2:
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_execnz .LBB137_2
+; GCN2-NEXT: ; %bb.3: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN2-NEXT: s_branch .LBB137_6
+; GCN2-NEXT: .LBB137_4:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT: .LBB137_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cbranch_execz .LBB137_6
+; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s34, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s34
@@ -21344,7 +22930,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN2-NEXT: .LBB137_4: ; %atomicrmw.end
+; GCN2-NEXT: .LBB137_6: ; %atomicrmw.phi
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
@@ -21355,20 +22941,36 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN3-NEXT: s_cmp_eq_u32 s5, s35
; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_vccz .LBB137_2
+; GCN3-NEXT: s_cbranch_vccz .LBB137_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v2, s4
+; GCN3-NEXT: v_mov_b32_e32 v3, s5
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; GCN3-NEXT: s_mov_b64 s[34:35], 0
+; GCN3-NEXT: .LBB137_2: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v7, v1
+; GCN3-NEXT: v_mov_b32_e32 v6, v0
+; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6
+; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
+; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
+; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execz .LBB137_3
-; GCN3-NEXT: s_branch .LBB137_4
-; GCN3-NEXT: .LBB137_2:
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_execnz .LBB137_2
+; GCN3-NEXT: ; %bb.3: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
+; GCN3-NEXT: s_branch .LBB137_6
+; GCN3-NEXT: .LBB137_4:
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT: .LBB137_3: ; %atomicrmw.private
+; GCN3-NEXT: s_cbranch_execz .LBB137_6
+; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN3-NEXT: s_cselect_b32 s34, s4, -1
; GCN3-NEXT: v_mov_b32_e32 v2, s34
@@ -21383,7 +22985,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT: .LBB137_4: ; %atomicrmw.end
+; GCN3-NEXT: .LBB137_6: ; %atomicrmw.phi
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst
@@ -21402,20 +23004,41 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GCN1-NEXT: s_cmp_eq_u32 s35, s36
; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT: s_cbranch_vccz .LBB138_2
+; GCN1-NEXT: s_cbranch_vccz .LBB138_4
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_add_u32 s36, s34, 4
+; GCN1-NEXT: s_addc_u32 s37, s35, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s36
+; GCN1-NEXT: v_mov_b32_e32 v1, s37
+; GCN1-NEXT: v_mov_b32_e32 v2, s34
+; GCN1-NEXT: v_mov_b32_e32 v3, s35
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[2:3]
+; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: .LBB138_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v7, v1
+; GCN1-NEXT: v_mov_b32_e32 v6, v0
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v6
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
+; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execz .LBB138_3
-; GCN1-NEXT: s_branch .LBB138_4
-; GCN1-NEXT: .LBB138_2:
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_cbranch_execnz .LBB138_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN1-NEXT: s_branch .LBB138_6
+; GCN1-NEXT: .LBB138_4:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT: .LBB138_3: ; %atomicrmw.private
+; GCN1-NEXT: s_cbranch_execz .LBB138_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
; GCN1-NEXT: s_cselect_b32 s34, s34, -1
@@ -21433,7 +23056,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN1-NEXT: .LBB138_4: ; %atomicrmw.end
+; GCN1-NEXT: .LBB138_6: ; %atomicrmw.phi
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
@@ -21448,20 +23071,41 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GCN2-NEXT: s_cmp_eq_u32 s35, s36
; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT: s_cbranch_vccz .LBB138_2
+; GCN2-NEXT: s_cbranch_vccz .LBB138_4
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_add_u32 s36, s34, 4
+; GCN2-NEXT: s_addc_u32 s37, s35, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s36
+; GCN2-NEXT: v_mov_b32_e32 v1, s37
+; GCN2-NEXT: v_mov_b32_e32 v2, s34
+; GCN2-NEXT: v_mov_b32_e32 v3, s35
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[2:3]
+; GCN2-NEXT: s_mov_b64 s[36:37], 0
+; GCN2-NEXT: .LBB138_2: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v7, v1
+; GCN2-NEXT: v_mov_b32_e32 v6, v0
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v6
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
+; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execz .LBB138_3
-; GCN2-NEXT: s_branch .LBB138_4
-; GCN2-NEXT: .LBB138_2:
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_cbranch_execnz .LBB138_2
+; GCN2-NEXT: ; %bb.3: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN2-NEXT: s_branch .LBB138_6
+; GCN2-NEXT: .LBB138_4:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT: .LBB138_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cbranch_execz .LBB138_6
+; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN2-NEXT: s_cselect_b32 s34, s34, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s34
@@ -21478,7 +23122,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen
-; GCN2-NEXT: .LBB138_4: ; %atomicrmw.end
+; GCN2-NEXT: .LBB138_6: ; %atomicrmw.phi
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
@@ -21491,20 +23135,36 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GCN3-NEXT: s_cmp_eq_u32 s35, s37
; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT: s_cbranch_vccz .LBB138_2
+; GCN3-NEXT: s_cbranch_vccz .LBB138_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v2, s34
+; GCN3-NEXT: v_mov_b32_e32 v3, s35
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; GCN3-NEXT: s_mov_b64 s[36:37], 0
+; GCN3-NEXT: .LBB138_2: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v7, v1
+; GCN3-NEXT: v_mov_b32_e32 v6, v0
+; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6
+; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
+; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
+; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execz .LBB138_3
-; GCN3-NEXT: s_branch .LBB138_4
-; GCN3-NEXT: .LBB138_2:
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_cbranch_execnz .LBB138_2
+; GCN3-NEXT: ; %bb.3: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[36:37]
+; GCN3-NEXT: s_branch .LBB138_6
+; GCN3-NEXT: .LBB138_4:
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT: .LBB138_3: ; %atomicrmw.private
+; GCN3-NEXT: s_cbranch_execz .LBB138_6
+; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
; GCN3-NEXT: s_cselect_b32 s34, s34, -1
; GCN3-NEXT: v_mov_b32_e32 v2, s34
@@ -21519,7 +23179,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen
; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT: .LBB138_4: ; %atomicrmw.end
+; GCN3-NEXT: .LBB138_6: ; %atomicrmw.phi
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
@@ -21817,23 +23477,46 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) {
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN1-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB141_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB141_4
+; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
+; GCN1-NEXT: s_cbranch_execnz .LBB141_6
; GCN1-NEXT: .LBB141_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB141_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v7, v[4:5]
+; GCN1-NEXT: flat_load_dword v6, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[10:11], 0
+; GCN1-NEXT: .LBB141_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GCN1-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6
+; GCN1-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN1-NEXT: s_cbranch_execnz .LBB141_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[10:11]
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
; GCN1-NEXT: s_cbranch_execz .LBB141_2
-; GCN1-NEXT: .LBB141_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB141_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
@@ -21861,23 +23544,46 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) {
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN2-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB141_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB141_4
+; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
+; GCN2-NEXT: s_cbranch_execnz .LBB141_6
; GCN2-NEXT: .LBB141_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB141_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v7, v[4:5]
+; GCN2-NEXT: flat_load_dword v6, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[10:11], 0
+; GCN2-NEXT: .LBB141_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GCN2-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6
+; GCN2-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN2-NEXT: s_cbranch_execnz .LBB141_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[10:11]
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
; GCN2-NEXT: s_cbranch_execz .LBB141_2
-; GCN2-NEXT: .LBB141_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB141_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
@@ -21903,23 +23609,43 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) {
; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN3-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB141_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
-; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB141_4
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
+; GCN3-NEXT: s_cbranch_execnz .LBB141_6
; GCN3-NEXT: .LBB141_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB141_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GCN3-NEXT: s_mov_b64 s[10:11], 0
+; GCN3-NEXT: .LBB141_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GCN3-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6
+; GCN3-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN3-NEXT: s_cbranch_execnz .LBB141_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[10:11]
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
; GCN3-NEXT: s_cbranch_execz .LBB141_2
-; GCN3-NEXT: .LBB141_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB141_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -21952,23 +23678,46 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN1-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB142_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB142_4
+; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
+; GCN1-NEXT: s_cbranch_execnz .LBB142_6
; GCN1-NEXT: .LBB142_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB142_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v7, v[4:5]
+; GCN1-NEXT: flat_load_dword v6, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[10:11], 0
+; GCN1-NEXT: .LBB142_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GCN1-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6
+; GCN1-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN1-NEXT: s_cbranch_execnz .LBB142_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[10:11]
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
; GCN1-NEXT: s_cbranch_execz .LBB142_2
-; GCN1-NEXT: .LBB142_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB142_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
@@ -21998,23 +23747,46 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN2-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB142_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB142_4
+; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
+; GCN2-NEXT: s_cbranch_execnz .LBB142_6
; GCN2-NEXT: .LBB142_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB142_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v7, v[4:5]
+; GCN2-NEXT: flat_load_dword v6, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[10:11], 0
+; GCN2-NEXT: .LBB142_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GCN2-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6
+; GCN2-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN2-NEXT: s_cbranch_execnz .LBB142_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[10:11]
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
; GCN2-NEXT: s_cbranch_execz .LBB142_2
-; GCN2-NEXT: .LBB142_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB142_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
@@ -22042,23 +23814,43 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base
; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN3-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB142_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
-; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB142_4
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
+; GCN3-NEXT: s_cbranch_execnz .LBB142_6
; GCN3-NEXT: .LBB142_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB142_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GCN3-NEXT: s_mov_b64 s[10:11], 0
+; GCN3-NEXT: .LBB142_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GCN3-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6
+; GCN3-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN3-NEXT: v_mov_b32_e32 v7, v5
+; GCN3-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN3-NEXT: v_mov_b32_e32 v6, v4
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN3-NEXT: s_cbranch_execnz .LBB142_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[10:11]
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
; GCN3-NEXT: s_cbranch_execz .LBB142_2
-; GCN3-NEXT: .LBB142_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB142_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -22087,46 +23879,65 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) {
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4
; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN1-NEXT: v_mov_b32_e32 v5, v1
-; GCN1-NEXT: v_mov_b32_e32 v4, v0
-; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB143_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB143_4
-; GCN1-NEXT: .LBB143_2: ; %atomicrmw.phi
-; GCN1-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-; GCN1-NEXT: .LBB143_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN1-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN1-NEXT: s_cbranch_execz .LBB143_4
+; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN1-NEXT: flat_load_dword v5, v[4:5]
+; GCN1-NEXT: flat_load_dword v4, v[0:1]
+; GCN1-NEXT: s_mov_b64 s[10:11], 0
+; GCN1-NEXT: .LBB143_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v7, v5
+; GCN1-NEXT: v_mov_b32_e32 v6, v4
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GCN1-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6
+; GCN1-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN1-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN1-NEXT: s_cbranch_execnz .LBB143_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
-; GCN1-NEXT: s_cbranch_execz .LBB143_2
-; GCN1-NEXT: .LBB143_4: ; %atomicrmw.private
-; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
-; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN1-NEXT: .LBB143_4: ; %Flow3
+; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
+; GCN1-NEXT: s_cbranch_execz .LBB143_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
; GCN1-NEXT: s_waitcnt vmcnt(1)
-; GCN1-NEXT: v_add_i32_e64 v6, s[6:7], -1, v0
+; GCN1-NEXT: v_add_i32_e64 v6, s[6:7], -1, v4
; GCN1-NEXT: s_waitcnt vmcnt(0)
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
-; GCN1-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7]
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
+; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[2:3]
+; GCN1-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v5, s[6:7]
; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GCN1-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GCN1-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
-; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN1-NEXT: .LBB143_6: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN1-NEXT: v_mov_b32_e32 v0, v4
+; GCN1-NEXT: v_mov_b32_e32 v1, v5
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
@@ -22135,46 +23946,65 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) {
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4
; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN2-NEXT: v_mov_b32_e32 v5, v1
-; GCN2-NEXT: v_mov_b32_e32 v4, v0
-; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
+; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB143_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB143_4
-; GCN2-NEXT: .LBB143_2: ; %atomicrmw.phi
-; GCN2-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-; GCN2-NEXT: .LBB143_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN2-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; GCN2-NEXT: s_cbranch_execz .LBB143_4
+; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN2-NEXT: flat_load_dword v5, v[4:5]
+; GCN2-NEXT: flat_load_dword v4, v[0:1]
+; GCN2-NEXT: s_mov_b64 s[10:11], 0
+; GCN2-NEXT: .LBB143_2: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v7, v5
+; GCN2-NEXT: v_mov_b32_e32 v6, v4
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GCN2-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6
+; GCN2-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GCN2-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN2-NEXT: s_cbranch_execnz .LBB143_2
+; GCN2-NEXT: ; %bb.3: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
-; GCN2-NEXT: s_cbranch_execz .LBB143_2
-; GCN2-NEXT: .LBB143_4: ; %atomicrmw.private
-; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
-; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
-; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen
+; GCN2-NEXT: .LBB143_4: ; %Flow3
+; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
+; GCN2-NEXT: s_cbranch_execz .LBB143_6
+; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0
+; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
; GCN2-NEXT: s_waitcnt vmcnt(1)
-; GCN2-NEXT: v_add_u32_e64 v6, s[6:7], -1, v0
+; GCN2-NEXT: v_add_u32_e64 v6, s[6:7], -1, v4
; GCN2-NEXT: s_waitcnt vmcnt(0)
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3]
-; GCN2-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7]
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
+; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[2:3]
+; GCN2-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v5, s[6:7]
; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GCN2-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
-; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN2-NEXT: .LBB143_6: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN2-NEXT: v_mov_b32_e32 v0, v4
+; GCN2-NEXT: v_mov_b32_e32 v1, v5
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
@@ -22187,23 +24017,43 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) {
; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN3-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB143_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
-; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB143_4
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
+; GCN3-NEXT: s_cbranch_execnz .LBB143_6
; GCN3-NEXT: .LBB143_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB143_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[10:11], 0
+; GCN3-NEXT: .LBB143_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3]
+; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v8
+; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v9, s[6:7]
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN3-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN3-NEXT: s_cbranch_execnz .LBB143_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[10:11]
; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
; GCN3-NEXT: s_cbranch_execz .LBB143_2
-; GCN3-NEXT: .LBB143_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB143_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -22238,23 +24088,46 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) {
; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN1-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN1-NEXT: s_cbranch_execnz .LBB144_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB144_4
+; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
+; GCN1-NEXT: s_cbranch_execnz .LBB144_6
; GCN1-NEXT: .LBB144_2: ; %atomicrmw.phi
; GCN1-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB144_3: ; %atomicrmw.global
-; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4
+; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[10:11], 0
+; GCN1-NEXT: .LBB144_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3]
+; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v8
+; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7]
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN1-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN1-NEXT: s_cbranch_execnz .LBB144_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[10:11]
; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
; GCN1-NEXT: s_cbranch_execz .LBB144_2
-; GCN1-NEXT: .LBB144_4: ; %atomicrmw.private
+; GCN1-NEXT: .LBB144_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4
@@ -22286,23 +24159,46 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) {
; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN2-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN2-NEXT: s_cbranch_execnz .LBB144_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB144_4
+; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
+; GCN2-NEXT: s_cbranch_execnz .LBB144_6
; GCN2-NEXT: .LBB144_2: ; %atomicrmw.phi
; GCN2-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB144_3: ; %atomicrmw.global
-; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4
+; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[10:11], 0
+; GCN2-NEXT: .LBB144_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3]
+; GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v8
+; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7]
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN2-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN2-NEXT: s_cbranch_execnz .LBB144_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[10:11]
; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
; GCN2-NEXT: s_cbranch_execz .LBB144_2
-; GCN2-NEXT: .LBB144_4: ; %atomicrmw.private
+; GCN2-NEXT: .LBB144_6: ; %atomicrmw.private
; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4
@@ -22332,23 +24228,43 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) {
; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN3-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GCN3-NEXT: s_cbranch_execnz .LBB144_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
-; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB144_4
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
+; GCN3-NEXT: s_cbranch_execnz .LBB144_6
; GCN3-NEXT: .LBB144_2: ; %atomicrmw.phi
; GCN3-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB144_3: ; %atomicrmw.global
-; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[10:11], 0
+; GCN3-NEXT: .LBB144_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3]
+; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v8
+; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v9, s[6:7]
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN3-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GCN3-NEXT: s_cbranch_execnz .LBB144_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[10:11]
; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5
; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9]
; GCN3-NEXT: s_cbranch_execz .LBB144_2
-; GCN3-NEXT: .LBB144_4: ; %atomicrmw.private
+; GCN3-NEXT: .LBB144_6: ; %atomicrmw.private
; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen
@@ -22384,21 +24300,46 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN1-NEXT: s_mov_b64 s[34:35], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB145_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_vccz .LBB145_4
+; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_vccnz .LBB145_6
; GCN1-NEXT: .LBB145_2: ; %atomicrmw.phi
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB145_3: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_add_u32 s34, s4, 4
+; GCN1-NEXT: s_addc_u32 s35, s5, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v4, s4
+; GCN1-NEXT: v_mov_b32_e32 v5, s5
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: flat_load_dword v2, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[38:39], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s7
+; GCN1-NEXT: v_mov_b32_e32 v7, s6
+; GCN1-NEXT: .LBB145_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
+; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v2
+; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37]
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execnz .LBB145_2
-; GCN1-NEXT: .LBB145_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_mov_b32_e32 v3, v1
+; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GCN1-NEXT: v_mov_b32_e32 v2, v0
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GCN1-NEXT: s_cbranch_execnz .LBB145_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[38:39]
+; GCN1-NEXT: s_branch .LBB145_2
+; GCN1-NEXT: .LBB145_6: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
@@ -22433,21 +24374,46 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN2-NEXT: s_mov_b64 s[34:35], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB145_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_vccz .LBB145_4
+; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_vccnz .LBB145_6
; GCN2-NEXT: .LBB145_2: ; %atomicrmw.phi
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB145_3: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_add_u32 s34, s4, 4
+; GCN2-NEXT: s_addc_u32 s35, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v4, s4
+; GCN2-NEXT: v_mov_b32_e32 v5, s5
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: flat_load_dword v2, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[38:39], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
+; GCN2-NEXT: .LBB145_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
+; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2
+; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37]
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execnz .LBB145_2
-; GCN2-NEXT: .LBB145_4: ; %atomicrmw.private
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_mov_b32_e32 v3, v1
+; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GCN2-NEXT: v_mov_b32_e32 v2, v0
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GCN2-NEXT: s_cbranch_execnz .LBB145_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[38:39]
+; GCN2-NEXT: s_branch .LBB145_2
+; GCN2-NEXT: .LBB145_6: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s34, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s34
@@ -22479,21 +24445,41 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
; GCN3-NEXT: s_mov_b64 s[34:35], -1
; GCN3-NEXT: s_cbranch_vccnz .LBB145_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
-; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_vccz .LBB145_4
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_vccnz .LBB145_6
; GCN3-NEXT: .LBB145_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB145_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v4, s4
+; GCN3-NEXT: v_mov_b32_e32 v5, s5
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[38:39], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: v_mov_b32_e32 v7, s6
+; GCN3-NEXT: .LBB145_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
+; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2
+; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37]
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execnz .LBB145_2
-; GCN3-NEXT: .LBB145_4: ; %atomicrmw.private
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v3, v1
+; GCN3-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GCN3-NEXT: v_mov_b32_e32 v2, v0
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GCN3-NEXT: s_cbranch_execnz .LBB145_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[38:39]
+; GCN3-NEXT: s_branch .LBB145_2
+; GCN3-NEXT: .LBB145_6: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN3-NEXT: s_cselect_b32 s34, s4, -1
; GCN3-NEXT: v_mov_b32_e32 v2, s34
@@ -22522,34 +24508,59 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
-; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
-; GCN1-NEXT: s_add_u32 s34, s4, 32
-; GCN1-NEXT: s_addc_u32 s35, s5, 0
+; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
+; GCN1-NEXT: s_add_u32 s38, s4, 32
+; GCN1-NEXT: s_addc_u32 s39, s5, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN1-NEXT: s_cmp_eq_u32 s35, s36
-; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
-; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT: s_mov_b64 s[36:37], -1
+; GCN1-NEXT: s_cmp_eq_u32 s39, s34
+; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN1-NEXT: s_mov_b64 s[34:35], -1
; GCN1-NEXT: s_cbranch_vccnz .LBB146_3
-; GCN1-NEXT: ; %bb.1: ; %Flow
-; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT: s_cbranch_vccz .LBB146_4
+; GCN1-NEXT: ; %bb.1: ; %Flow3
+; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_vccnz .LBB146_6
; GCN1-NEXT: .LBB146_2: ; %atomicrmw.phi
; GCN1-NEXT: s_setpc_b64 s[30:31]
; GCN1-NEXT: .LBB146_3: ; %atomicrmw.global
+; GCN1-NEXT: s_add_u32 s34, s38, 4
+; GCN1-NEXT: s_addc_u32 s35, s39, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GCN1-NEXT: v_mov_b32_e32 v4, s38
+; GCN1-NEXT: v_mov_b32_e32 v5, s39
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
+; GCN1-NEXT: flat_load_dword v2, v[4:5]
+; GCN1-NEXT: s_mov_b64 s[40:41], 0
+; GCN1-NEXT: v_mov_b32_e32 v6, s7
+; GCN1-NEXT: v_mov_b32_e32 v7, s6
+; GCN1-NEXT: .LBB146_4: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
+; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v2
+; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37]
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execnz .LBB146_2
-; GCN1-NEXT: .LBB146_4: ; %atomicrmw.private
-; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN1-NEXT: v_mov_b32_e32 v3, v1
+; GCN1-NEXT: s_or_b64 s[40:41], vcc, s[40:41]
+; GCN1-NEXT: v_mov_b32_e32 v2, v0
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[40:41]
+; GCN1-NEXT: s_cbranch_execnz .LBB146_4
+; GCN1-NEXT: ; %bb.5: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[40:41]
+; GCN1-NEXT: s_branch .LBB146_2
+; GCN1-NEXT: .LBB146_6: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[38:39], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s6
-; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
-; GCN1-NEXT: s_cselect_b32 s34, s34, -1
+; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT: s_cselect_b32 s34, s38, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s34
; GCN1-NEXT: s_add_i32 s34, s34, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s34
@@ -22573,32 +24584,57 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
-; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
-; GCN2-NEXT: s_add_u32 s34, s4, 32
-; GCN2-NEXT: s_addc_u32 s35, s5, 0
+; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
+; GCN2-NEXT: s_add_u32 s38, s4, 32
+; GCN2-NEXT: s_addc_u32 s39, s5, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_cmp_eq_u32 s35, s36
-; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
-; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT: s_mov_b64 s[36:37], -1
+; GCN2-NEXT: s_cmp_eq_u32 s39, s34
+; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN2-NEXT: s_mov_b64 s[34:35], -1
; GCN2-NEXT: s_cbranch_vccnz .LBB146_3
-; GCN2-NEXT: ; %bb.1: ; %Flow
-; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT: s_cbranch_vccz .LBB146_4
+; GCN2-NEXT: ; %bb.1: ; %Flow3
+; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_vccnz .LBB146_6
; GCN2-NEXT: .LBB146_2: ; %atomicrmw.phi
; GCN2-NEXT: s_setpc_b64 s[30:31]
; GCN2-NEXT: .LBB146_3: ; %atomicrmw.global
+; GCN2-NEXT: s_add_u32 s34, s38, 4
+; GCN2-NEXT: s_addc_u32 s35, s39, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GCN2-NEXT: v_mov_b32_e32 v4, s38
+; GCN2-NEXT: v_mov_b32_e32 v5, s39
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
+; GCN2-NEXT: flat_load_dword v2, v[4:5]
+; GCN2-NEXT: s_mov_b64 s[40:41], 0
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
+; GCN2-NEXT: .LBB146_4: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
+; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2
+; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37]
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execnz .LBB146_2
-; GCN2-NEXT: .LBB146_4: ; %atomicrmw.private
-; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
-; GCN2-NEXT: s_cselect_b32 s34, s34, -1
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN2-NEXT: v_mov_b32_e32 v3, v1
+; GCN2-NEXT: s_or_b64 s[40:41], vcc, s[40:41]
+; GCN2-NEXT: v_mov_b32_e32 v2, v0
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[40:41]
+; GCN2-NEXT: s_cbranch_execnz .LBB146_4
+; GCN2-NEXT: ; %bb.5: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[40:41]
+; GCN2-NEXT: s_branch .LBB146_2
+; GCN2-NEXT: .LBB146_6: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[38:39], 0
+; GCN2-NEXT: s_cselect_b32 s34, s38, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s34
; GCN2-NEXT: s_add_i32 s34, s34, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s34
@@ -22622,31 +24658,51 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg
; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: s_add_u32 s34, s4, 32
-; GCN3-NEXT: s_addc_u32 s35, s5, 0
-; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
-; GCN3-NEXT: s_cmp_eq_u32 s35, s37
-; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
-; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT: s_mov_b64 s[36:37], -1
+; GCN3-NEXT: s_add_u32 s38, s4, 32
+; GCN3-NEXT: s_addc_u32 s39, s5, 0
+; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
+; GCN3-NEXT: s_cmp_eq_u32 s39, s35
+; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN3-NEXT: s_mov_b64 s[34:35], -1
; GCN3-NEXT: s_cbranch_vccnz .LBB146_3
-; GCN3-NEXT: ; %bb.1: ; %Flow
-; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT: s_cbranch_vccz .LBB146_4
+; GCN3-NEXT: ; %bb.1: ; %Flow3
+; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_vccnz .LBB146_6
; GCN3-NEXT: .LBB146_2: ; %atomicrmw.phi
; GCN3-NEXT: s_setpc_b64 s[30:31]
; GCN3-NEXT: .LBB146_3: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v4, s38
+; GCN3-NEXT: v_mov_b32_e32 v5, s39
+; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GCN3-NEXT: s_mov_b64 s[40:41], 0
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: v_mov_b32_e32 v7, s6
+; GCN3-NEXT: .LBB146_4: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
+; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2
+; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37]
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execnz .LBB146_2
-; GCN3-NEXT: .LBB146_4: ; %atomicrmw.private
-; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
-; GCN3-NEXT: s_cselect_b32 s34, s34, -1
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GCN3-NEXT: v_mov_b32_e32 v3, v1
+; GCN3-NEXT: s_or_b64 s[40:41], vcc, s[40:41]
+; GCN3-NEXT: v_mov_b32_e32 v2, v0
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[40:41]
+; GCN3-NEXT: s_cbranch_execnz .LBB146_4
+; GCN3-NEXT: ; %bb.5: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[40:41]
+; GCN3-NEXT: s_branch .LBB146_2
+; GCN3-NEXT: .LBB146_6: ; %atomicrmw.private
+; GCN3-NEXT: s_cmp_lg_u64 s[38:39], 0
+; GCN3-NEXT: s_cselect_b32 s34, s38, -1
; GCN3-NEXT: v_mov_b32_e32 v2, s34
; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
@@ -22679,20 +24735,45 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN1-NEXT: s_cmp_eq_u32 s5, s34
; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_vccz .LBB147_2
+; GCN1-NEXT: s_cbranch_vccz .LBB147_4
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_add_u32 s34, s4, 4
+; GCN1-NEXT: s_addc_u32 s35, s5, 0
+; GCN1-NEXT: v_mov_b32_e32 v0, s34
+; GCN1-NEXT: v_mov_b32_e32 v1, s35
+; GCN1-NEXT: v_mov_b32_e32 v2, s4
+; GCN1-NEXT: v_mov_b32_e32 v3, s5
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[2:3]
+; GCN1-NEXT: s_mov_b64 s[38:39], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: v_mov_b32_e32 v5, s6
+; GCN1-NEXT: .LBB147_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9]
+; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v8
+; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37]
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execz .LBB147_3
-; GCN1-NEXT: s_branch .LBB147_4
-; GCN1-NEXT: .LBB147_2:
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GCN1-NEXT: s_cbranch_execnz .LBB147_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[38:39]
+; GCN1-NEXT: s_branch .LBB147_6
+; GCN1-NEXT: .LBB147_4:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT: .LBB147_3: ; %atomicrmw.private
+; GCN1-NEXT: s_cbranch_execz .LBB147_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s6
; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
@@ -22714,7 +24795,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
-; GCN1-NEXT: .LBB147_4: ; %atomicrmw.end
+; GCN1-NEXT: .LBB147_6: ; %atomicrmw.phi
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
@@ -22727,20 +24808,45 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN2-NEXT: s_cmp_eq_u32 s5, s34
; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_vccz .LBB147_2
+; GCN2-NEXT: s_cbranch_vccz .LBB147_4
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_add_u32 s34, s4, 4
+; GCN2-NEXT: s_addc_u32 s35, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s34
+; GCN2-NEXT: v_mov_b32_e32 v1, s35
+; GCN2-NEXT: v_mov_b32_e32 v2, s4
+; GCN2-NEXT: v_mov_b32_e32 v3, s5
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[2:3]
+; GCN2-NEXT: s_mov_b64 s[38:39], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: v_mov_b32_e32 v5, s6
+; GCN2-NEXT: .LBB147_2: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9]
+; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8
+; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37]
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execz .LBB147_3
-; GCN2-NEXT: s_branch .LBB147_4
-; GCN2-NEXT: .LBB147_2:
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GCN2-NEXT: s_cbranch_execnz .LBB147_2
+; GCN2-NEXT: ; %bb.3: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[38:39]
+; GCN2-NEXT: s_branch .LBB147_6
+; GCN2-NEXT: .LBB147_4:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT: .LBB147_3: ; %atomicrmw.private
+; GCN2-NEXT: s_cbranch_execz .LBB147_6
+; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN2-NEXT: s_cselect_b32 s34, s4, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s34
@@ -22761,7 +24867,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
-; GCN2-NEXT: .LBB147_4: ; %atomicrmw.end
+; GCN2-NEXT: .LBB147_6: ; %atomicrmw.phi
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
@@ -22772,20 +24878,40 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN3-NEXT: s_cmp_eq_u32 s5, s35
; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_vccz .LBB147_2
+; GCN3-NEXT: s_cbranch_vccz .LBB147_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v2, s4
+; GCN3-NEXT: v_mov_b32_e32 v3, s5
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; GCN3-NEXT: s_mov_b64 s[38:39], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s7
+; GCN3-NEXT: v_mov_b32_e32 v5, s6
+; GCN3-NEXT: .LBB147_2: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9]
+; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8
+; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37]
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execz .LBB147_3
-; GCN3-NEXT: s_branch .LBB147_4
-; GCN3-NEXT: .LBB147_2:
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN3-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GCN3-NEXT: s_cbranch_execnz .LBB147_2
+; GCN3-NEXT: ; %bb.3: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[38:39]
+; GCN3-NEXT: s_branch .LBB147_6
+; GCN3-NEXT: .LBB147_4:
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT: .LBB147_3: ; %atomicrmw.private
+; GCN3-NEXT: s_cbranch_execz .LBB147_6
+; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0
; GCN3-NEXT: s_cselect_b32 s34, s4, -1
; GCN3-NEXT: v_mov_b32_e32 v2, s34
@@ -22804,7 +24930,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT: .LBB147_4: ; %atomicrmw.end
+; GCN3-NEXT: .LBB147_6: ; %atomicrmw.phi
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst
@@ -22816,31 +24942,56 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GCN1: ; %bb.0:
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4
-; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0
-; GCN1-NEXT: s_add_u32 s34, s4, 32
-; GCN1-NEXT: s_addc_u32 s35, s5, 0
+; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0
+; GCN1-NEXT: s_add_u32 s38, s4, 32
+; GCN1-NEXT: s_addc_u32 s39, s5, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
-; GCN1-NEXT: s_cmp_eq_u32 s35, s36
-; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0
-; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN1-NEXT: s_cbranch_vccz .LBB148_2
+; GCN1-NEXT: s_cmp_eq_u32 s39, s34
+; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN1-NEXT: s_cbranch_vccz .LBB148_4
; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN1-NEXT: s_add_u32 s34, s38, 4
+; GCN1-NEXT: s_addc_u32 s35, s39, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v2, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: v_mov_b32_e32 v3, s7
-; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: v_mov_b32_e32 v2, s38
+; GCN1-NEXT: v_mov_b32_e32 v3, s39
+; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v0, v[2:3]
+; GCN1-NEXT: s_mov_b64 s[40:41], 0
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: v_mov_b32_e32 v5, s6
+; GCN1-NEXT: .LBB148_2: ; %atomicrmw.start
+; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v9, v1
+; GCN1-NEXT: v_mov_b32_e32 v8, v0
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9]
+; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v8
+; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37]
+; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN1-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
+; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: s_cbranch_execz .LBB148_3
-; GCN1-NEXT: s_branch .LBB148_4
-; GCN1-NEXT: .LBB148_2:
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN1-NEXT: s_or_b64 s[40:41], vcc, s[40:41]
+; GCN1-NEXT: s_andn2_b64 exec, exec, s[40:41]
+; GCN1-NEXT: s_cbranch_execnz .LBB148_2
+; GCN1-NEXT: ; %bb.3: ; %Flow
+; GCN1-NEXT: s_or_b64 exec, exec, s[40:41]
+; GCN1-NEXT: s_branch .LBB148_6
+; GCN1-NEXT: .LBB148_4:
; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN1-NEXT: .LBB148_3: ; %atomicrmw.private
-; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0
+; GCN1-NEXT: s_cbranch_execz .LBB148_6
+; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[38:39], 0
; GCN1-NEXT: v_mov_b32_e32 v5, s6
-; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec
-; GCN1-NEXT: s_cselect_b32 s34, s34, -1
+; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec
+; GCN1-NEXT: s_cselect_b32 s34, s38, -1
; GCN1-NEXT: v_mov_b32_e32 v2, s34
; GCN1-NEXT: s_add_i32 s34, s34, 4
; GCN1-NEXT: v_mov_b32_e32 v3, s34
@@ -22858,7 +25009,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
-; GCN1-NEXT: .LBB148_4: ; %atomicrmw.end
+; GCN1-NEXT: .LBB148_6: ; %atomicrmw.phi
; GCN1-NEXT: s_waitcnt vmcnt(0)
; GCN1-NEXT: s_setpc_b64 s[30:31]
;
@@ -22866,29 +25017,54 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GCN2: ; %bb.0:
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4
-; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0
-; GCN2-NEXT: s_add_u32 s34, s4, 32
-; GCN2-NEXT: s_addc_u32 s35, s5, 0
+; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0
+; GCN2-NEXT: s_add_u32 s38, s4, 32
+; GCN2-NEXT: s_addc_u32 s39, s5, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_cmp_eq_u32 s35, s36
-; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0
-; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN2-NEXT: s_cbranch_vccz .LBB148_2
+; GCN2-NEXT: s_cmp_eq_u32 s39, s34
+; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN2-NEXT: s_cbranch_vccz .LBB148_4
; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global
+; GCN2-NEXT: s_add_u32 s34, s38, 4
+; GCN2-NEXT: s_addc_u32 s35, s39, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
-; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: v_mov_b32_e32 v2, s38
+; GCN2-NEXT: v_mov_b32_e32 v3, s39
+; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v0, v[2:3]
+; GCN2-NEXT: s_mov_b64 s[40:41], 0
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: v_mov_b32_e32 v5, s6
+; GCN2-NEXT: .LBB148_2: ; %atomicrmw.start
+; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v9, v1
+; GCN2-NEXT: v_mov_b32_e32 v8, v0
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9]
+; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8
+; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37]
+; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN2-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
+; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: s_cbranch_execz .LBB148_3
-; GCN2-NEXT: s_branch .LBB148_4
-; GCN2-NEXT: .LBB148_2:
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN2-NEXT: s_or_b64 s[40:41], vcc, s[40:41]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[40:41]
+; GCN2-NEXT: s_cbranch_execnz .LBB148_2
+; GCN2-NEXT: ; %bb.3: ; %Flow
+; GCN2-NEXT: s_or_b64 exec, exec, s[40:41]
+; GCN2-NEXT: s_branch .LBB148_6
+; GCN2-NEXT: .LBB148_4:
; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN2-NEXT: .LBB148_3: ; %atomicrmw.private
-; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0
-; GCN2-NEXT: s_cselect_b32 s34, s34, -1
+; GCN2-NEXT: s_cbranch_execz .LBB148_6
+; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[38:39], 0
+; GCN2-NEXT: s_cselect_b32 s34, s38, -1
; GCN2-NEXT: v_mov_b32_e32 v2, s34
; GCN2-NEXT: s_add_i32 s34, s34, 4
; GCN2-NEXT: v_mov_b32_e32 v3, s34
@@ -22907,35 +25083,55 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
-; GCN2-NEXT: .LBB148_4: ; %atomicrmw.end
+; GCN2-NEXT: .LBB148_6: ; %atomicrmw.phi
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: s_setpc_b64 s[30:31]
;
; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: s_add_u32 s34, s4, 32
-; GCN3-NEXT: s_addc_u32 s35, s5, 0
-; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base
-; GCN3-NEXT: s_cmp_eq_u32 s35, s37
-; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0
-; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37]
-; GCN3-NEXT: s_cbranch_vccz .LBB148_2
+; GCN3-NEXT: s_add_u32 s38, s4, 32
+; GCN3-NEXT: s_addc_u32 s39, s5, 0
+; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base
+; GCN3-NEXT: s_cmp_eq_u32 s39, s35
+; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0
+; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35]
+; GCN3-NEXT: s_cbranch_vccz .LBB148_4
; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global
-; GCN3-NEXT: v_mov_b32_e32 v0, s34
-; GCN3-NEXT: v_mov_b32_e32 v2, s6
-; GCN3-NEXT: v_mov_b32_e32 v1, s35
-; GCN3-NEXT: v_mov_b32_e32 v3, s7
-; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN3-NEXT: v_mov_b32_e32 v2, s38
+; GCN3-NEXT: v_mov_b32_e32 v3, s39
+; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; GCN3-NEXT: s_mov_b64 s[40:41], 0
+; GCN3-NEXT: v_mov_b32_e32 v4, s7
+; GCN3-NEXT: v_mov_b32_e32 v5, s6
+; GCN3-NEXT: .LBB148_2: ; %atomicrmw.start
+; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN3-NEXT: v_mov_b32_e32 v9, v1
+; GCN3-NEXT: v_mov_b32_e32 v8, v0
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9]
+; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8
+; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37]
+; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GCN3-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
+; GCN3-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
+; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: s_cbranch_execz .LBB148_3
-; GCN3-NEXT: s_branch .LBB148_4
-; GCN3-NEXT: .LBB148_2:
+; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GCN3-NEXT: s_or_b64 s[40:41], vcc, s[40:41]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[40:41]
+; GCN3-NEXT: s_cbranch_execnz .LBB148_2
+; GCN3-NEXT: ; %bb.3: ; %Flow
+; GCN3-NEXT: s_or_b64 exec, exec, s[40:41]
+; GCN3-NEXT: s_branch .LBB148_6
+; GCN3-NEXT: .LBB148_4:
; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN3-NEXT: .LBB148_3: ; %atomicrmw.private
-; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0
-; GCN3-NEXT: s_cselect_b32 s34, s34, -1
+; GCN3-NEXT: s_cbranch_execz .LBB148_6
+; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private
+; GCN3-NEXT: s_cmp_lg_u64 s[38:39], 0
+; GCN3-NEXT: s_cselect_b32 s34, s38, -1
; GCN3-NEXT: v_mov_b32_e32 v2, s34
; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4
@@ -22952,7 +25148,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen
; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
-; GCN3-NEXT: .LBB148_4: ; %atomicrmw.end
+; GCN3-NEXT: .LBB148_6: ; %atomicrmw.phi
; GCN3-NEXT: s_waitcnt vmcnt(0)
; GCN3-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll
index fe47461..4dea449 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll
@@ -1097,25 +1097,76 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_sub_i64_noret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v6, v[0:1]
+; GFX7-NEXT: flat_load_dword v7, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v6, v2
+; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB30_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_sub_i64_noret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v6, v[0:1]
+; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v6, v2
+; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB30_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_sub_i64_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB30_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret void
@@ -1125,29 +1176,80 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_sub_i64_noret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
+; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
+; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v6, v2
+; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB31_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_sub_i64_noret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
+; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v6, v2
+; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB31_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_sub_i64_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32
+; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB31_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -1158,25 +1260,82 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_sub_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: flat_load_dword v5, v[5:6]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v6, v2
+; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB32_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_sub_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v5, v[5:6]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v6, v2
+; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB32_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_sub_i64_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB32_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret i64 %result
@@ -1186,29 +1345,82 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_sub_i64_ret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_sub_i32_e32 v6, vcc, v8, v2
+; GFX7-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB33_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_sub_i64_ret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v8, v2
+; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB33_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_sub_i64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
+; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB33_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -1219,37 +1431,95 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GFX7-LABEL: flat_atomic_sub_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_addc_u32 s35, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v3, s34
+; GFX7-NEXT: v_mov_b32_e32 v4, s35
+; GFX7-NEXT: flat_load_dword v2, v[0:1]
+; GFX7-NEXT: flat_load_dword v3, v[3:4]
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v5, s5
+; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2
+; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX7-NEXT: s_cbranch_execnz .LBB34_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_sub_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_addc_u32 s35, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s34
+; GFX8-NEXT: v_mov_b32_e32 v4, s35
+; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX8-NEXT: s_cbranch_execnz .LBB34_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_sub_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v6, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB34_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret void
@@ -1261,13 +1531,32 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v3, s35
-; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
+; GFX7-NEXT: s_add_u32 s36, s4, 36
+; GFX7-NEXT: s_addc_u32 s37, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s36
+; GFX7-NEXT: v_mov_b32_e32 v1, s37
+; GFX7-NEXT: v_mov_b32_e32 v4, s34
+; GFX7-NEXT: v_mov_b32_e32 v5, s35
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: flat_load_dword v2, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2
+; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX7-NEXT: s_cbranch_execnz .LBB35_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_sub_i64_noret_offset_scalar:
@@ -1275,25 +1564,60 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v3, s35
-; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
+; GFX8-NEXT: s_add_u32 s36, s4, 36
+; GFX8-NEXT: s_addc_u32 s37, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s36
+; GFX8-NEXT: v_mov_b32_e32 v1, s37
+; GFX8-NEXT: v_mov_b32_e32 v4, s34
+; GFX8-NEXT: v_mov_b32_e32 v5, s35
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX8-NEXT: s_cbranch_execnz .LBB35_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_sub_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v6, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB35_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -1304,37 +1628,95 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX7-LABEL: flat_atomic_sub_i64_ret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_addc_u32 s35, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
+; GFX7-NEXT: v_mov_b32_e32 v3, s35
+; GFX7-NEXT: flat_load_dword v0, v[0:1]
+; GFX7-NEXT: flat_load_dword v1, v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s7
; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v8, v1
+; GFX7-NEXT: v_mov_b32_e32 v7, v0
+; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7
+; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX7-NEXT: s_cbranch_execnz .LBB36_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_sub_i64_ret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_addc_u32 s35, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
+; GFX8-NEXT: v_mov_b32_e32 v3, s35
+; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v8, v1
+; GFX8-NEXT: v_mov_b32_e32 v7, v0
+; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7
+; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX8-NEXT: s_cbranch_execnz .LBB36_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_sub_i64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7
+; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB36_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret i64 %result
@@ -1346,13 +1728,32 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
+; GFX7-NEXT: s_add_u32 s36, s4, 36
+; GFX7-NEXT: s_addc_u32 s37, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s36
+; GFX7-NEXT: v_mov_b32_e32 v1, s37
; GFX7-NEXT: v_mov_b32_e32 v2, s34
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v3, s35
-; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: flat_load_dword v0, v[2:3]
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s7
+; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v8, v1
+; GFX7-NEXT: v_mov_b32_e32 v7, v0
+; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7
+; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX7-NEXT: s_cbranch_execnz .LBB37_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_sub_i64_ret_offset_scalar:
@@ -1360,25 +1761,60 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
+; GFX8-NEXT: s_add_u32 s36, s4, 36
+; GFX8-NEXT: s_addc_u32 s37, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s36
+; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v2, s34
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v3, s35
-; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: flat_load_dword v0, v[2:3]
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v8, v1
+; GFX8-NEXT: v_mov_b32_e32 v7, v0
+; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7
+; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX8-NEXT: s_cbranch_execnz .LBB37_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_sub_i64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
+; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7
+; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB37_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -1459,25 +1895,76 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_and_i64_noret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v6, v[0:1]
+; GFX7-NEXT: flat_load_dword v7, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v5, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v6, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB40_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_and_i64_noret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v6, v[0:1]
+; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v5, v7, v3
+; GFX8-NEXT: v_and_b32_e32 v4, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB40_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_and_i64_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_and_b32_e32 v4, v6, v2
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB40_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret void
@@ -1487,29 +1974,80 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_and_i64_noret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
+; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
+; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v5, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v6, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB41_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_and_i64_noret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
+; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v5, v7, v3
+; GFX8-NEXT: v_and_b32_e32 v4, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB41_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_and_i64_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32
+; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_and_b32_e32 v4, v6, v2
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB41_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -1520,25 +2058,82 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_and_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: flat_load_dword v5, v[5:6]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v7, v3
+; GFX7-NEXT: v_and_b32_e32 v4, v6, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB42_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_and_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v5, v[5:6]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_and_b32_e32 v5, v7, v3
+; GFX8-NEXT: v_and_b32_e32 v4, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB42_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_and_i64_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_and_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_and_b32_e32 v4, v6, v2
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB42_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret i64 %result
@@ -1548,29 +2143,82 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_and_i64_ret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_and_b32_e32 v7, v9, v3
+; GFX7-NEXT: v_and_b32_e32 v6, v8, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB43_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_and_i64_ret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_and_b32_e32 v7, v9, v3
+; GFX8-NEXT: v_and_b32_e32 v6, v8, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB43_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_and_i64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
+; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_and_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_and_b32_e32 v4, v6, v2
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB43_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -1581,37 +2229,92 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GFX7-LABEL: flat_atomic_and_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_addc_u32 s35, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v3, s34
+; GFX7-NEXT: v_mov_b32_e32 v4, s35
+; GFX7-NEXT: flat_load_dword v2, v[0:1]
+; GFX7-NEXT: flat_load_dword v3, v[3:4]
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v5, s5
+; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v1, s7, v3
+; GFX7-NEXT: v_and_b32_e32 v0, s6, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX7-NEXT: s_cbranch_execnz .LBB44_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_and_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_addc_u32 s35, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s34
+; GFX8-NEXT: v_mov_b32_e32 v4, s35
+; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v1, s7, v3
+; GFX8-NEXT: v_and_b32_e32 v0, s6, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX8-NEXT: s_cbranch_execnz .LBB44_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_and_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, s7, v3
+; GFX9-NEXT: v_and_b32_e32 v0, s6, v2
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB44_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret void
@@ -1623,13 +2326,31 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v3, s35
-; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
+; GFX7-NEXT: s_add_u32 s36, s4, 36
+; GFX7-NEXT: s_addc_u32 s37, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s36
+; GFX7-NEXT: v_mov_b32_e32 v1, s37
+; GFX7-NEXT: v_mov_b32_e32 v4, s34
+; GFX7-NEXT: v_mov_b32_e32 v5, s35
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: flat_load_dword v2, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v1, s7, v3
+; GFX7-NEXT: v_and_b32_e32 v0, s6, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX7-NEXT: s_cbranch_execnz .LBB45_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_and_i64_noret_offset_scalar:
@@ -1637,25 +2358,58 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v3, s35
-; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
+; GFX8-NEXT: s_add_u32 s36, s4, 36
+; GFX8-NEXT: s_addc_u32 s37, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s36
+; GFX8-NEXT: v_mov_b32_e32 v1, s37
+; GFX8-NEXT: v_mov_b32_e32 v4, s34
+; GFX8-NEXT: v_mov_b32_e32 v5, s35
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v1, s7, v3
+; GFX8-NEXT: v_and_b32_e32 v0, s6, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX8-NEXT: s_cbranch_execnz .LBB45_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_and_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, s7, v3
+; GFX9-NEXT: v_and_b32_e32 v0, s6, v2
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB45_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -1666,37 +2420,92 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX7-LABEL: flat_atomic_and_i64_ret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_addc_u32 s35, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
+; GFX7-NEXT: v_mov_b32_e32 v3, s35
+; GFX7-NEXT: flat_load_dword v0, v[0:1]
+; GFX7-NEXT: flat_load_dword v1, v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_and_b32_e32 v5, s7, v7
+; GFX7-NEXT: v_and_b32_e32 v4, s6, v6
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX7-NEXT: s_cbranch_execnz .LBB46_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_and_i64_ret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_addc_u32 s35, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
+; GFX8-NEXT: v_mov_b32_e32 v3, s35
+; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_and_b32_e32 v5, s7, v7
+; GFX8-NEXT: v_and_b32_e32 v4, s6, v6
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX8-NEXT: s_cbranch_execnz .LBB46_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_and_i64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_and_b32_e32 v5, s7, v7
+; GFX9-NEXT: v_and_b32_e32 v4, s6, v6
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB46_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret i64 %result
@@ -1708,13 +2517,31 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
+; GFX7-NEXT: s_add_u32 s36, s4, 36
+; GFX7-NEXT: s_addc_u32 s37, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s36
+; GFX7-NEXT: v_mov_b32_e32 v1, s37
; GFX7-NEXT: v_mov_b32_e32 v2, s34
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v3, s35
-; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: flat_load_dword v0, v[2:3]
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_and_b32_e32 v5, s7, v7
+; GFX7-NEXT: v_and_b32_e32 v4, s6, v6
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX7-NEXT: s_cbranch_execnz .LBB47_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_and_i64_ret_offset_scalar:
@@ -1722,25 +2549,58 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
+; GFX8-NEXT: s_add_u32 s36, s4, 36
+; GFX8-NEXT: s_addc_u32 s37, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s36
+; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v2, s34
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v3, s35
-; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: flat_load_dword v0, v[2:3]
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_and_b32_e32 v5, s7, v7
+; GFX8-NEXT: v_and_b32_e32 v4, s6, v6
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX8-NEXT: s_cbranch_execnz .LBB47_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_and_i64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
+; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_and_b32_e32 v5, s7, v7
+; GFX9-NEXT: v_and_b32_e32 v4, s6, v6
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB47_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -2771,25 +3631,76 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_or_i64_noret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v6, v[0:1]
+; GFX7-NEXT: flat_load_dword v7, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB60_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_or_i64_noret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v6, v[0:1]
+; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v3
+; GFX8-NEXT: v_or_b32_e32 v4, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB60_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_or_i64_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_or_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_or_b32_e32 v4, v6, v2
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB60_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret void
@@ -2799,29 +3710,80 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_or_i64_noret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
+; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
+; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB61_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_or_i64_noret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
+; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v3
+; GFX8-NEXT: v_or_b32_e32 v4, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB61_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_or_i64_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32
+; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_or_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_or_b32_e32 v4, v6, v2
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB61_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -2832,25 +3794,82 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_or_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: flat_load_dword v5, v[5:6]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB62_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_or_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v5, v[5:6]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v3
+; GFX8-NEXT: v_or_b32_e32 v4, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB62_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_or_i64_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_or_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_or_b32_e32 v4, v6, v2
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB62_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret i64 %result
@@ -2860,29 +3879,82 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_or_i64_ret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_or_b32_e32 v7, v9, v3
+; GFX7-NEXT: v_or_b32_e32 v6, v8, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB63_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_or_i64_ret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_or_b32_e32 v7, v9, v3
+; GFX8-NEXT: v_or_b32_e32 v6, v8, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB63_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_or_i64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
+; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_or_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_or_b32_e32 v4, v6, v2
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB63_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -2893,37 +3965,92 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
; GFX7-LABEL: flat_atomic_or_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_addc_u32 s35, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v3, s34
+; GFX7-NEXT: v_mov_b32_e32 v4, s35
+; GFX7-NEXT: flat_load_dword v2, v[0:1]
+; GFX7-NEXT: flat_load_dword v3, v[3:4]
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v5, s5
+; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_or_b32_e32 v1, s7, v3
+; GFX7-NEXT: v_or_b32_e32 v0, s6, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX7-NEXT: s_cbranch_execnz .LBB64_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_or_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_addc_u32 s35, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s34
+; GFX8-NEXT: v_mov_b32_e32 v4, s35
+; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: .LBB64_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v1, s7, v3
+; GFX8-NEXT: v_or_b32_e32 v0, s6, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX8-NEXT: s_cbranch_execnz .LBB64_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_or_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: .LBB64_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_or_b32_e32 v1, s7, v3
+; GFX9-NEXT: v_or_b32_e32 v0, s6, v2
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB64_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret void
@@ -2935,13 +4062,31 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v3, s35
-; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GFX7-NEXT: s_add_u32 s36, s4, 36
+; GFX7-NEXT: s_addc_u32 s37, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s36
+; GFX7-NEXT: v_mov_b32_e32 v1, s37
+; GFX7-NEXT: v_mov_b32_e32 v4, s34
+; GFX7-NEXT: v_mov_b32_e32 v5, s35
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: flat_load_dword v2, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: .LBB65_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_or_b32_e32 v1, s7, v3
+; GFX7-NEXT: v_or_b32_e32 v0, s6, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX7-NEXT: s_cbranch_execnz .LBB65_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_or_i64_noret_offset_scalar:
@@ -2949,25 +4094,58 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v3, s35
-; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; GFX8-NEXT: s_add_u32 s36, s4, 36
+; GFX8-NEXT: s_addc_u32 s37, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s36
+; GFX8-NEXT: v_mov_b32_e32 v1, s37
+; GFX8-NEXT: v_mov_b32_e32 v4, s34
+; GFX8-NEXT: v_mov_b32_e32 v5, s35
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: .LBB65_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_or_b32_e32 v1, s7, v3
+; GFX8-NEXT: v_or_b32_e32 v0, s6, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX8-NEXT: s_cbranch_execnz .LBB65_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_or_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_or_b32_e32 v1, s7, v3
+; GFX9-NEXT: v_or_b32_e32 v0, s6, v2
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB65_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -2978,37 +4156,92 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
; GFX7-LABEL: flat_atomic_or_i64_ret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_addc_u32 s35, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
+; GFX7-NEXT: v_mov_b32_e32 v3, s35
+; GFX7-NEXT: flat_load_dword v0, v[0:1]
+; GFX7-NEXT: flat_load_dword v1, v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_or_b32_e32 v5, s7, v7
+; GFX7-NEXT: v_or_b32_e32 v4, s6, v6
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX7-NEXT: s_cbranch_execnz .LBB66_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_or_i64_ret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_addc_u32 s35, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
+; GFX8-NEXT: v_mov_b32_e32 v3, s35
+; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: .LBB66_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_or_b32_e32 v5, s7, v7
+; GFX8-NEXT: v_or_b32_e32 v4, s6, v6
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX8-NEXT: s_cbranch_execnz .LBB66_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_or_i64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_or_b32_e32 v5, s7, v7
+; GFX9-NEXT: v_or_b32_e32 v4, s6, v6
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB66_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret i64 %result
@@ -3020,13 +4253,31 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
+; GFX7-NEXT: s_add_u32 s36, s4, 36
+; GFX7-NEXT: s_addc_u32 s37, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s36
+; GFX7-NEXT: v_mov_b32_e32 v1, s37
; GFX7-NEXT: v_mov_b32_e32 v2, s34
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v3, s35
-; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: flat_load_dword v0, v[2:3]
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: .LBB67_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_or_b32_e32 v5, s7, v7
+; GFX7-NEXT: v_or_b32_e32 v4, s6, v6
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX7-NEXT: s_cbranch_execnz .LBB67_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_or_i64_ret_offset_scalar:
@@ -3034,25 +4285,58 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
+; GFX8-NEXT: s_add_u32 s36, s4, 36
+; GFX8-NEXT: s_addc_u32 s37, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s36
+; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v2, s34
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v3, s35
-; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: flat_load_dword v0, v[2:3]
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_or_b32_e32 v5, s7, v7
+; GFX8-NEXT: v_or_b32_e32 v4, s6, v6
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX8-NEXT: s_cbranch_execnz .LBB67_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_or_i64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
+; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_or_b32_e32 v5, s7, v7
+; GFX9-NEXT: v_or_b32_e32 v4, s6, v6
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB67_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -3133,25 +4417,76 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_xor_i64_noret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v6, v[0:1]
+; GFX7-NEXT: flat_load_dword v7, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB70_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_xor_b32_e32 v5, v7, v3
+; GFX7-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB70_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xor_i64_noret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v6, v[0:1]
+; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB70_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v5, v7, v3
+; GFX8-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB70_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xor_i64_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB70_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB70_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret void
@@ -3161,29 +4496,80 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_xor_i64_noret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
+; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
+; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB71_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_xor_b32_e32 v5, v7, v3
+; GFX7-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB71_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xor_i64_noret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
+; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v5, v7, v3
+; GFX8-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB71_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xor_i64_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32
+; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB71_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB71_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -3194,25 +4580,82 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_xor_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: flat_load_dword v5, v[5:6]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB72_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_xor_b32_e32 v5, v7, v3
+; GFX7-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB72_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xor_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v5, v[5:6]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB72_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_xor_b32_e32 v5, v7, v3
+; GFX8-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB72_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xor_i64_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB72_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB72_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret i64 %result
@@ -3222,29 +4665,82 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_xor_i64_ret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB73_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_xor_b32_e32 v7, v9, v3
+; GFX7-NEXT: v_xor_b32_e32 v6, v8, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB73_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xor_i64_ret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_xor_b32_e32 v7, v9, v3
+; GFX8-NEXT: v_xor_b32_e32 v6, v8, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB73_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xor_i64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
+; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB73_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB73_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -3255,37 +4751,92 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr
; GFX7-LABEL: flat_atomic_xor_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_addc_u32 s35, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v3, s34
+; GFX7-NEXT: v_mov_b32_e32 v4, s35
+; GFX7-NEXT: flat_load_dword v2, v[0:1]
+; GFX7-NEXT: flat_load_dword v3, v[3:4]
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v5, s5
+; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_xor_b32_e32 v1, s7, v3
+; GFX7-NEXT: v_xor_b32_e32 v0, s6, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX7-NEXT: s_cbranch_execnz .LBB74_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xor_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_addc_u32 s35, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s34
+; GFX8-NEXT: v_mov_b32_e32 v4, s35
+; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v1, s7, v3
+; GFX8-NEXT: v_xor_b32_e32 v0, s6, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX8-NEXT: s_cbranch_execnz .LBB74_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xor_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: .LBB74_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3
+; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB74_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret void
@@ -3297,13 +4848,31 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v3, s35
-; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
+; GFX7-NEXT: s_add_u32 s36, s4, 36
+; GFX7-NEXT: s_addc_u32 s37, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s36
+; GFX7-NEXT: v_mov_b32_e32 v1, s37
+; GFX7-NEXT: v_mov_b32_e32 v4, s34
+; GFX7-NEXT: v_mov_b32_e32 v5, s35
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: flat_load_dword v2, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_xor_b32_e32 v1, s7, v3
+; GFX7-NEXT: v_xor_b32_e32 v0, s6, v2
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX7-NEXT: s_cbranch_execnz .LBB75_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xor_i64_noret_offset_scalar:
@@ -3311,25 +4880,58 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v3, s35
-; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
+; GFX8-NEXT: s_add_u32 s36, s4, 36
+; GFX8-NEXT: s_addc_u32 s37, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s36
+; GFX8-NEXT: v_mov_b32_e32 v1, s37
+; GFX8-NEXT: v_mov_b32_e32 v4, s34
+; GFX8-NEXT: v_mov_b32_e32 v5, s35
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v1, s7, v3
+; GFX8-NEXT: v_xor_b32_e32 v0, s6, v2
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX8-NEXT: s_cbranch_execnz .LBB75_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xor_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3
+; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB75_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -3340,37 +4942,92 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
; GFX7-LABEL: flat_atomic_xor_i64_ret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_addc_u32 s35, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
+; GFX7-NEXT: v_mov_b32_e32 v3, s35
+; GFX7-NEXT: flat_load_dword v0, v[0:1]
+; GFX7-NEXT: flat_load_dword v1, v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_xor_b32_e32 v5, s7, v7
+; GFX7-NEXT: v_xor_b32_e32 v4, s6, v6
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX7-NEXT: s_cbranch_execnz .LBB76_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xor_i64_ret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_addc_u32 s35, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
+; GFX8-NEXT: v_mov_b32_e32 v3, s35
+; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_xor_b32_e32 v5, s7, v7
+; GFX8-NEXT: v_xor_b32_e32 v4, s6, v6
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX8-NEXT: s_cbranch_execnz .LBB76_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xor_i64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_xor_b32_e32 v5, s7, v7
+; GFX9-NEXT: v_xor_b32_e32 v4, s6, v6
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB76_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret i64 %result
@@ -3382,13 +5039,31 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
+; GFX7-NEXT: s_add_u32 s36, s4, 36
+; GFX7-NEXT: s_addc_u32 s37, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s36
+; GFX7-NEXT: v_mov_b32_e32 v1, s37
; GFX7-NEXT: v_mov_b32_e32 v2, s34
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v3, s35
-; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: flat_load_dword v0, v[2:3]
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_xor_b32_e32 v5, s7, v7
+; GFX7-NEXT: v_xor_b32_e32 v4, s6, v6
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX7-NEXT: s_cbranch_execnz .LBB77_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xor_i64_ret_offset_scalar:
@@ -3396,25 +5071,58 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
+; GFX8-NEXT: s_add_u32 s36, s4, 36
+; GFX8-NEXT: s_addc_u32 s37, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s36
+; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v2, s34
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v3, s35
-; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: flat_load_dword v0, v[2:3]
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_xor_b32_e32 v5, s7, v7
+; GFX8-NEXT: v_xor_b32_e32 v4, s6, v6
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX8-NEXT: s_cbranch_execnz .LBB77_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xor_i64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
+; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_xor_b32_e32 v5, s7, v7
+; GFX9-NEXT: v_xor_b32_e32 v4, s6, v6
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB77_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -4697,83 +6405,29 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
; GFX7-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
-; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v7, v[0:1]
-; GFX7-NEXT: flat_load_dword v6, v[8:9]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB92_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v7, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB92_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[0:1]
-; GFX8-NEXT: flat_load_dword v6, v[8:9]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB92_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB92_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
+; GFX9-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX9-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB92_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
@@ -4784,85 +6438,29 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6
; GFX7-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
-; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB93_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
-; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB93_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v1, v[0:1]
-; GFX8-NEXT: flat_load_dword v0, v[4:5]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB93_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB93_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
+; GFX9-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB93_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
@@ -5977,83 +7575,29 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out
; GFX7-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
-; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v7, v[0:1]
-; GFX7-NEXT: flat_load_dword v6, v[8:9]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB105_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v7, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB105_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[0:1]
-; GFX8-NEXT: flat_load_dword v6, v[8:9]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB105_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB105_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
+; GFX9-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX9-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB105_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
@@ -6064,85 +7608,29 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i
; GFX7-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
-; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB106_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
-; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB106_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v1, v[0:1]
-; GFX8-NEXT: flat_load_dword v0, v[4:5]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB106_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB106_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
+; GFX9-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB106_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
@@ -6921,83 +8409,29 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out
; GFX7-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
-; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v7, v[0:1]
-; GFX7-NEXT: flat_load_dword v6, v[8:9]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB115_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v7, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB115_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[0:1]
-; GFX8-NEXT: flat_load_dword v6, v[8:9]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB115_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB115_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB115_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
+; GFX9-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX9-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB115_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
@@ -7008,85 +8442,29 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i
; GFX7-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
-; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB116_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
-; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB116_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v1, v[0:1]
-; GFX8-NEXT: flat_load_dword v0, v[4:5]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB116_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB116_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB116_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
+; GFX9-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB116_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
@@ -8296,83 +9674,29 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out,
; GFX7-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
-; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v7, v[0:1]
-; GFX7-NEXT: flat_load_dword v6, v[8:9]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB129_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v7, v1
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB129_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[0:1]
-; GFX8-NEXT: flat_load_dword v6, v[8:9]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB129_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB129_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
+; GFX9-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX9-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB129_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
@@ -8383,85 +9707,29 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6
; GFX7-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
-; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB130_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
-; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB130_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v1, v[0:1]
-; GFX8-NEXT: flat_load_dword v0, v[4:5]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB130_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB130_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
+; GFX9-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB130_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
@@ -8476,25 +9744,85 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v6, v[0:1]
+; GFX7-NEXT: flat_load_dword v7, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB131_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 1, v6
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB131_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v6, v[0:1]
+; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB131_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v6
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB131_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB131_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB131_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret void
@@ -8504,29 +9832,89 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
+; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
+; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB132_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB132_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
+; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB132_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB132_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32
+; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB132_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB132_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -8537,25 +9925,91 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: flat_load_dword v5, v[5:6]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB133_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 1, v6
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB133_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v5, v[5:6]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB133_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v6
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB133_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB133_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB133_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret i64 %result
@@ -8565,29 +10019,91 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB134_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v8
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
+; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB134_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB134_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v8
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB134_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
+; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB134_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB134_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -8598,37 +10114,101 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i
; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_addc_u32 s35, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v3, s34
+; GFX7-NEXT: v_mov_b32_e32 v4, s35
+; GFX7-NEXT: flat_load_dword v2, v[0:1]
+; GFX7-NEXT: flat_load_dword v3, v[3:4]
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: v_mov_b32_e32 v5, s5
+; GFX7-NEXT: .LBB135_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX7-NEXT: s_cbranch_execnz .LBB135_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_addc_u32 s35, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s34
+; GFX8-NEXT: v_mov_b32_e32 v4, s35
+; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: .LBB135_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX8-NEXT: s_cbranch_execnz .LBB135_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: .LBB135_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB135_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret void
@@ -8640,13 +10220,34 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v3, s35
-; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX7-NEXT: s_add_u32 s36, s4, 36
+; GFX7-NEXT: s_addc_u32 s37, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s36
+; GFX7-NEXT: v_mov_b32_e32 v1, s37
+; GFX7-NEXT: v_mov_b32_e32 v4, s34
+; GFX7-NEXT: v_mov_b32_e32 v5, s35
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: flat_load_dword v2, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: .LBB136_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX7-NEXT: s_cbranch_execnz .LBB136_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar:
@@ -8654,25 +10255,64 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v3, s35
-; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX8-NEXT: s_add_u32 s36, s4, 36
+; GFX8-NEXT: s_addc_u32 s37, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s36
+; GFX8-NEXT: v_mov_b32_e32 v1, s37
+; GFX8-NEXT: v_mov_b32_e32 v4, s34
+; GFX8-NEXT: v_mov_b32_e32 v5, s35
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: .LBB136_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX8-NEXT: s_cbranch_execnz .LBB136_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: .LBB136_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB136_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -8683,37 +10323,101 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_addc_u32 s35, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
+; GFX7-NEXT: v_mov_b32_e32 v3, s35
+; GFX7-NEXT: flat_load_dword v0, v[0:1]
+; GFX7-NEXT: flat_load_dword v1, v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: .LBB137_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
+; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX7-NEXT: s_cbranch_execnz .LBB137_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_addc_u32 s35, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
+; GFX8-NEXT: v_mov_b32_e32 v3, s35
+; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: .LBB137_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
+; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX8-NEXT: s_cbranch_execnz .LBB137_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX9-NEXT: .LBB137_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB137_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret i64 %result
@@ -8725,13 +10429,34 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
+; GFX7-NEXT: s_add_u32 s36, s4, 36
+; GFX7-NEXT: s_addc_u32 s37, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s36
+; GFX7-NEXT: v_mov_b32_e32 v1, s37
; GFX7-NEXT: v_mov_b32_e32 v2, s34
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v3, s35
-; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: flat_load_dword v0, v[2:3]
+; GFX7-NEXT: s_mov_b64 s[34:35], 0
+; GFX7-NEXT: .LBB138_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
+; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX7-NEXT: s_cbranch_execnz .LBB138_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar:
@@ -8739,25 +10464,64 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
+; GFX8-NEXT: s_add_u32 s36, s4, 36
+; GFX8-NEXT: s_addc_u32 s37, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s36
+; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v2, s34
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v3, s35
-; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: flat_load_dword v0, v[2:3]
+; GFX8-NEXT: s_mov_b64 s[34:35], 0
+; GFX8-NEXT: .LBB138_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
+; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX8-NEXT: s_cbranch_execnz .LBB138_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
+; GFX9-NEXT: .LBB138_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB138_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -8838,25 +10602,91 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v6, v[0:1]
+; GFX7-NEXT: flat_load_dword v7, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: .LBB141_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GFX7-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GFX7-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6
+; GFX7-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB141_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v6, v[0:1]
+; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: .LBB141_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GFX8-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6
+; GFX8-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_cbranch_execnz .LBB141_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-NEXT: .LBB141_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX9-NEXT: s_cbranch_execnz .LBB141_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret void
@@ -8866,29 +10696,95 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
+; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
+; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: .LBB142_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GFX7-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], -1, v6
+; GFX7-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v7, s[6:7]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e32 v5, v1, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB142_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
+; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: .LBB142_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GFX8-NEXT: v_add_u32_e64 v0, s[6:7], -1, v6
+; GFX8-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v7, s[6:7]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_cbranch_execnz .LBB142_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32
+; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
+; GFX9-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-NEXT: .LBB142_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX9-NEXT: s_cbranch_execnz .LBB142_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -8899,25 +10795,97 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: flat_load_dword v5, v[5:6]
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: .LBB143_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GFX7-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GFX7-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6
+; GFX7-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB143_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v5, v[5:6]
+; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: .LBB143_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GFX8-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6
+; GFX8-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_cbranch_execnz .LBB143_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-NEXT: .LBB143_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX9-NEXT: s_cbranch_execnz .LBB143_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret i64 %result
@@ -8927,29 +10895,97 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: .LBB144_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX7-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3]
+; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], -1, v8
+; GFX7-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB144_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: .LBB144_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3]
+; GFX8-NEXT: v_add_u32_e64 v0, s[6:7], -1, v8
+; GFX8-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_cbranch_execnz .LBB144_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
+; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
+; GFX9-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-NEXT: .LBB144_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX9-NEXT: s_cbranch_execnz .LBB144_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -8960,37 +10996,113 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i
; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v2, s4
-; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_addc_u32 s35, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v3, s34
+; GFX7-NEXT: v_mov_b32_e32 v4, s35
+; GFX7-NEXT: flat_load_dword v2, v[0:1]
+; GFX7-NEXT: flat_load_dword v3, v[3:4]
+; GFX7-NEXT: v_mov_b32_e32 v4, s4
+; GFX7-NEXT: s_mov_b64 s[38:39], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: v_mov_b32_e32 v5, s5
+; GFX7-NEXT: .LBB145_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
+; GFX7-NEXT: v_add_i32_e64 v0, s[36:37], -1, v2
+; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX7-NEXT: s_cbranch_execnz .LBB145_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_addc_u32 s35, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s34
+; GFX8-NEXT: v_mov_b32_e32 v4, s35
+; GFX8-NEXT: flat_load_dword v2, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: s_mov_b64 s[38:39], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: .LBB145_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
+; GFX8-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2
+; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX8-NEXT: s_cbranch_execnz .LBB145_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: .LBB145_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
+; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2
+; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37]
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: s_cbranch_execnz .LBB145_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret void
@@ -9002,13 +11114,38 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
-; GFX7-NEXT: v_mov_b32_e32 v2, s34
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: v_mov_b32_e32 v3, s35
-; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX7-NEXT: s_add_u32 s36, s4, 36
+; GFX7-NEXT: s_addc_u32 s37, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s36
+; GFX7-NEXT: v_mov_b32_e32 v1, s37
+; GFX7-NEXT: v_mov_b32_e32 v4, s34
+; GFX7-NEXT: v_mov_b32_e32 v5, s35
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: flat_load_dword v2, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[38:39], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s7
+; GFX7-NEXT: v_mov_b32_e32 v7, s6
+; GFX7-NEXT: .LBB146_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
+; GFX7-NEXT: v_add_i32_e64 v0, s[36:37], -1, v2
+; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX7-NEXT: s_cbranch_execnz .LBB146_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar:
@@ -9016,25 +11153,72 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s34
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v3, s35
-; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX8-NEXT: s_add_u32 s36, s4, 36
+; GFX8-NEXT: s_addc_u32 s37, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s36
+; GFX8-NEXT: v_mov_b32_e32 v1, s37
+; GFX8-NEXT: v_mov_b32_e32 v4, s34
+; GFX8-NEXT: v_mov_b32_e32 v5, s35
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[38:39], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NEXT: .LBB146_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
+; GFX8-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2
+; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX8-NEXT: s_cbranch_execnz .LBB146_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: .LBB146_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
+; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2
+; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37]
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: s_cbranch_execnz .LBB146_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
@@ -9045,37 +11229,113 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64
; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_add_u32 s34, s4, 4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_addc_u32 s35, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v2, s34
+; GFX7-NEXT: v_mov_b32_e32 v3, s35
+; GFX7-NEXT: flat_load_dword v0, v[0:1]
+; GFX7-NEXT: flat_load_dword v1, v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: s_mov_b64 s[38:39], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s7
+; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: v_mov_b32_e32 v3, s5
-; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: .LBB147_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9]
+; GFX7-NEXT: v_add_i32_e64 v0, s[36:37], -1, v8
+; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX7-NEXT: s_cbranch_execnz .LBB147_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_add_u32 s34, s4, 4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_addc_u32 s35, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v2, s34
+; GFX8-NEXT: v_mov_b32_e32 v3, s35
+; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: s_mov_b64 s[38:39], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: .LBB147_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9]
+; GFX8-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8
+; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX8-NEXT: s_cbranch_execnz .LBB147_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX9-NEXT: .LBB147_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9]
+; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8
+; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37]
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: s_cbranch_execnz .LBB147_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
ret i64 %result
@@ -9087,13 +11347,38 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
+; GFX7-NEXT: s_add_u32 s36, s4, 36
+; GFX7-NEXT: s_addc_u32 s37, s5, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s36
+; GFX7-NEXT: v_mov_b32_e32 v1, s37
; GFX7-NEXT: v_mov_b32_e32 v2, s34
-; GFX7-NEXT: v_mov_b32_e32 v0, s6
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v3, s35
-; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: flat_load_dword v0, v[2:3]
+; GFX7-NEXT: s_mov_b64 s[38:39], 0
+; GFX7-NEXT: v_mov_b32_e32 v4, s7
+; GFX7-NEXT: v_mov_b32_e32 v5, s6
+; GFX7-NEXT: .LBB148_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9]
+; GFX7-NEXT: v_add_i32_e64 v0, s[36:37], -1, v8
+; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37]
+; GFX7-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX7-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX7-NEXT: s_cbranch_execnz .LBB148_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar:
@@ -9101,25 +11386,72 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
+; GFX8-NEXT: s_add_u32 s36, s4, 36
+; GFX8-NEXT: s_addc_u32 s37, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s36
+; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v2, s34
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v3, s35
-; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: flat_load_dword v0, v[2:3]
+; GFX8-NEXT: s_mov_b64 s[38:39], 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
+; GFX8-NEXT: v_mov_b32_e32 v5, s6
+; GFX8-NEXT: .LBB148_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9]
+; GFX8-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8
+; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37]
+; GFX8-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX8-NEXT: s_cbranch_execnz .LBB148_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
+; GFX9-NEXT: .LBB148_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9]
+; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8
+; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37]
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: s_cbranch_execnz .LBB148_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr %out, i64 4
%result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.gfx11plus.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.gfx11plus.ll
new file mode 100644
index 0000000..5f86f2e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.gfx11plus.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -stop-after=amdgpu-isel -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11
+
+@const_half = internal constant half 1.0
+
+define amdgpu_kernel void @fma_v2f16_divergent(
+ ; GFX11-LABEL: name: fma_v2f16_divergent
+ ; GFX11: bb.0 (%ir-block.0):
+ ; GFX11-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s256) from %ir.r.kernarg.offset, align 4, addrspace 4)
+ ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 68, 0 :: (dereferenceable invariant load (s32) from %ir.d.kernarg.offset, addrspace 4)
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
+ ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2
+ ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
+ ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
+ ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
+ ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
+ ; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
+ ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
+ ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1
+ ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023
+ ; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec
+ ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+ ; GFX11-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec
+ ; GFX11-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 killed [[REG_SEQUENCE1]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s16) from %ir.f.gep, addrspace 1)
+ ; GFX11-NEXT: [[V_AND_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_AND_B16_t16_e64 0, 32767, 0, [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], 0, implicit $exec
+ ; GFX11-NEXT: [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, -32768, 0, [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], 0, implicit $exec
+ ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GFX11-NEXT: S_CMP_LG_U32 killed [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_2]], implicit-def $scc
+ ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
+ ; GFX11-NEXT: [[V_CNDMASK_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CNDMASK_B16_t16_e64 0, killed [[V_XOR_B16_t16_e64_]], 0, killed [[V_AND_B16_t16_e64_]], killed [[COPY10]], 0, implicit $exec
+ ; GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX11-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_CNDMASK_B16_t16_e64_]], %subreg.lo16, killed [[DEF]], %subreg.hi16
+ ; GFX11-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[REG_SEQUENCE2]], 0, 0 :: ("amdgpu-noclobber" load (s32) from %ir.4, addrspace 1)
+ ; GFX11-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[REG_SEQUENCE3]], 0, 0 :: ("amdgpu-noclobber" load (s32) from %ir.5, addrspace 1)
+ ; GFX11-NEXT: [[V_PK_FMA_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_FMA_F16 0, killed [[REG_SEQUENCE4]], 8, killed [[S_LOAD_DWORD_IMM1]], 8, killed [[S_LOAD_DWORD_IMM2]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: GLOBAL_STORE_DWORD_SADDR killed [[V_MOV_B32_e32_]], killed [[V_PK_FMA_F16_]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
+ ; GFX11-NEXT: S_ENDPGM 0
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %fptr,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c,
+ i32 %d) {
+
+ %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %f.gep = getelementptr half, ptr addrspace(1) %fptr, i32 %idx
+ %f = load half, ptr addrspace(1) %f.gep
+ %f.abs = call half @llvm.fabs.f16(half %f)
+ %f.neg = fneg half %f
+ %setcc = icmp ne i32 %d, 0
+ %select = select i1 %setcc, half %f.abs, half %f.neg
+ %vec = insertelement <2 x half> poison, half %select, i32 0
+ %a.val = insertelement <2 x half> %vec, half %select, i32 1
+ %b.v = load i32, ptr addrspace(1) %b
+ %b.val = bitcast i32 %b.v to <2 x half>
+ %c.v = load i32, ptr addrspace(1) %c
+ %c.val = bitcast i32 %c.v to <2 x half>
+ %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> %c.val)
+ store <2 x half> %r.val, ptr addrspace(1) %r
+ ret void
+}
+
+define amdgpu_kernel void @fma_v2f16_uniform(
+ ; GFX11-LABEL: name: fma_v2f16_uniform
+ ; GFX11: bb.0 (%ir-block.0):
+ ; GFX11-NEXT: liveins: $sgpr4_sgpr5
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s256) from %ir.r.kernarg.offset, align 4, addrspace 4)
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
+ ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2
+ ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
+ ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
+ ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
+ ; GFX11-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
+ ; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
+ ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY8]], %subreg.sub0, killed [[COPY7]], %subreg.sub1
+ ; GFX11-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 killed [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16) from %ir.3, addrspace 1)
+ ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]]
+ ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[COPY9]]
+ ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[REG_SEQUENCE2]], 0, 0 :: ("amdgpu-noclobber" load (s32) from %ir.4, addrspace 1)
+ ; GFX11-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[REG_SEQUENCE3]], 0, 0 :: ("amdgpu-noclobber" load (s32) from %ir.5, addrspace 1)
+ ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed [[S_LOAD_DWORD_IMM1]]
+ ; GFX11-NEXT: [[V_PK_FMA_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_FMA_F16 0, killed [[S_MOV_B32_]], 8, killed [[S_LOAD_DWORD_IMM]], 8, [[COPY10]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[V_PK_FMA_F16_]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (store (s32) into %ir.2, addrspace 1)
+ ; GFX11-NEXT: S_ENDPGM 0
+ ptr addrspace(1) %r,
+ ptr addrspace(1) %a,
+ ptr addrspace(1) %b,
+ ptr addrspace(1) %c) {
+ %a.half = load half, ptr addrspace(1) %a
+ %vec = insertelement <2 x half> poison, half %a.half, i32 0
+ %a.val = insertelement <2 x half> %vec, half %a.half, i32 1
+ %b.v = load i32, ptr addrspace(1) %b
+ %b.val = bitcast i32 %b.v to <2 x half>
+ %c.v = load i32, ptr addrspace(1) %c
+ %c.val = bitcast i32 %c.v to <2 x half>
+ %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> %c.val)
+ store <2 x half> %r.val, ptr addrspace(1) %r
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
index 9c4901e..899cc89 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
@@ -4238,7 +4238,7 @@ define amdgpu_ps i32 @s_mul_32_f16(half inreg %x, half inreg %y) {
; GFX11-GISEL-TRUE16-LABEL: s_mul_32_f16:
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: v_mul_f16_e64 v0.l, 0x5000, s0
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-GISEL-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-GISEL-TRUE16-NEXT: ; return to shader part epilog
;
diff --git a/llvm/test/CodeGen/AMDGPU/fncall-implicitdef.ll b/llvm/test/CodeGen/AMDGPU/fncall-implicitdef.ll
new file mode 100644
index 0000000..66a8b42
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fncall-implicitdef.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -O1 %s -o - | FileCheck %s
+
+define amdgpu_ps <4 x float> @caller(ptr %ptr) {
+; CHECK-LABEL: caller:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: flat_load_dword v1, v[0:1]
+; CHECK-NEXT: s_mov_b32 s0, 0
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: s_mov_b32 s2, 0
+; CHECK-NEXT: s_mov_b32 s5, fn@abs32@hi
+; CHECK-NEXT: s_mov_b32 s4, fn@abs32@lo
+; CHECK-NEXT: s_mov_b64 s[8:9], 0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_mov_b32 s3, 0
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: s_mov_b32 s32, 0
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; CHECK-NEXT: ; return to shader part epilog
+ %L = load i32, ptr %ptr, align 4
+ %R = call <4 x float> @fn(<4 x i32> zeroinitializer, i32 0, i32 %L, i32 0)
+ ret <4 x float> %R
+}
+
+declare hidden <4 x float> @fn(<4 x i32> inreg, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 1b092b2..5674ae3 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -349,29 +349,24 @@ define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, v3, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, -v0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: select_fneg_xor_select_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, -v2, v3, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -v0, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fneg0 = xor i32 %arg0, -2147483648
%select0 = select i1 %cond0, i32 %arg1, i32 %fneg0
@@ -550,31 +545,25 @@ define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
-; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
+; GCN-NEXT: v_cndmask_b32_e64 v2, -v3, v5, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v2, -v2, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: select_fneg_xor_select_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
-; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_and_b32 v1, 1, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v2, -v3, v5, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, -v2, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fneg0 = xor i64 %arg0, 9223372036854775808
%select0 = select i1 %cond0, i64 %arg1, i64 %fneg0
diff --git a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll
index 3e2680f..6bb68e1 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll
@@ -1,12 +1,20 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-MUBUF %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-FLATSCR %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-MUBUF %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-FLATSCR %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-SDAG,GFX90A-MUBUF,GFX90A-SDAG-MUBUF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-SDAG,GFX90A-FLATSCR,GFX90A-SDAG-FLATSCR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-SDAG,GFX10-MUBUF,GFX10-SDAG-MUBUF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-SDAG,GFX10-FLATSCR,GFX10-SDAG-FLATSCR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11,GFX11-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-SDAG %s
+
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-GISEL,GFX90A-MUBUF,GFX90A-GISEL-MUBUF %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-GISEL,GFX90A-FLATSCR,GFX90A-GISEL-FLATSCR %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-GISEL,GFX10-MUBUF,GFX10-GISEL-MUBUF %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-GISEL,GFX10-FLATSCR,GFX10-GISEL-FLATSCR %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-GISEL %s
; This test checks memory addresses with constant offset components that should
; not be folded into memory accesses with immediate offsets.
@@ -19,67 +27,146 @@
; FIXME the offset here should not be folded: if %p points to the beginning of
; scratch or LDS and %i is -1, a folded offset crashes the program.
define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) {
-; GFX90A-LABEL: flat_offset_maybe_oob:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX90A-NEXT: flat_load_dword v0, v[0:1] offset:12
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: flat_offset_maybe_oob:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
-; GFX10-NEXT: flat_load_dword v0, v[0:1] offset:12
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942-LABEL: flat_offset_maybe_oob:
-; GFX942: ; %bb.0:
-; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
-; GFX942-NEXT: flat_load_dword v0, v[0:1] offset:12
-; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942-NEXT: s_setpc_b64 s[30:31]
+; GFX90A-SDAG-LABEL: flat_offset_maybe_oob:
+; GFX90A-SDAG: ; %bb.0:
+; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX90A-SDAG-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX90A-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX90A-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX90A-SDAG-NEXT: flat_load_dword v0, v[0:1] offset:12
+; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_offset_maybe_oob:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
-; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:12
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX10-SDAG-LABEL: flat_offset_maybe_oob:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-SDAG-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX10-SDAG-NEXT: flat_load_dword v0, v[0:1] offset:12
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-SDAG-LABEL: flat_offset_maybe_oob:
+; GFX942-SDAG: ; %bb.0:
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GFX942-SDAG-NEXT: flat_load_dword v0, v[0:1] offset:12
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: flat_offset_maybe_oob:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
-; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
-; GFX12-NEXT: flat_load_b32 v0, v[0:1] offset:12
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: flat_offset_maybe_oob:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX11-SDAG-NEXT: flat_load_b32 v0, v[0:1] offset:12
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: flat_offset_maybe_oob:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX12-SDAG-NEXT: flat_load_b32 v0, v[0:1] offset:12
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-GISEL-LABEL: flat_offset_maybe_oob:
+; GFX90A-GISEL: ; %bb.0:
+; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX90A-GISEL-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 12, v0
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-GISEL-NEXT: flat_load_dword v0, v[0:1]
+; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: flat_offset_maybe_oob:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-GISEL-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX10-GISEL-NEXT: flat_load_dword v0, v[0:1]
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: flat_offset_maybe_oob:
+; GFX942-GISEL: ; %bb.0:
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX942-GISEL-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 12, v0
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX942-GISEL-NEXT: flat_load_dword v0, v[0:1]
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: flat_offset_maybe_oob:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT: flat_load_b32 v0, v[0:1]
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_offset_maybe_oob:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffd
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-GISEL-NEXT: flat_load_b32 v0, v[0:1]
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%idx = add nsw i32 %i, 3
%arrayidx = getelementptr inbounds i32, ptr %p, i32 %idx
%l = load i32, ptr %arrayidx
@@ -88,13 +175,13 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) {
; For MUBUF and for GFX12, folding the offset is okay.
define i32 @private_offset_maybe_oob(ptr addrspace(5) %p, i32 %i) {
-; GFX90A-MUBUF-LABEL: private_offset_maybe_oob:
-; GFX90A-MUBUF: ; %bb.0:
-; GFX90A-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-MUBUF-NEXT: v_lshl_add_u32 v0, v1, 2, v0
-; GFX90A-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12
-; GFX90A-MUBUF-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-MUBUF-NEXT: s_setpc_b64 s[30:31]
+; GFX90A-SDAG-MUBUF-LABEL: private_offset_maybe_oob:
+; GFX90A-SDAG-MUBUF: ; %bb.0:
+; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-SDAG-MUBUF-NEXT: v_lshl_add_u32 v0, v1, 2, v0
+; GFX90A-SDAG-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12
+; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-SDAG-MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-FLATSCR-LABEL: private_offset_maybe_oob:
; GFX90A-FLATSCR: ; %bb.0:
@@ -105,13 +192,13 @@ define i32 @private_offset_maybe_oob(ptr addrspace(5) %p, i32 %i) {
; GFX90A-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX90A-FLATSCR-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-MUBUF-LABEL: private_offset_maybe_oob:
-; GFX10-MUBUF: ; %bb.0:
-; GFX10-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-MUBUF-NEXT: v_lshl_add_u32 v0, v1, 2, v0
-; GFX10-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12
-; GFX10-MUBUF-NEXT: s_waitcnt vmcnt(0)
-; GFX10-MUBUF-NEXT: s_setpc_b64 s[30:31]
+; GFX10-SDAG-MUBUF-LABEL: private_offset_maybe_oob:
+; GFX10-SDAG-MUBUF: ; %bb.0:
+; GFX10-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-MUBUF-NEXT: v_lshl_add_u32 v0, v1, 2, v0
+; GFX10-SDAG-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12
+; GFX10-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-FLATSCR-LABEL: private_offset_maybe_oob:
; GFX10-FLATSCR: ; %bb.0:
@@ -141,19 +228,61 @@ define i32 @private_offset_maybe_oob(ptr addrspace(5) %p, i32 %i) {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: private_offset_maybe_oob:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshl_add_u32 v0, v1, 2, v0
-; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:12
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-SDAG-LABEL: private_offset_maybe_oob:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, v0
+; GFX12-SDAG-NEXT: scratch_load_b32 v0, v0, off offset:12
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-GISEL-MUBUF-LABEL: private_offset_maybe_oob:
+; GFX90A-GISEL-MUBUF: ; %bb.0:
+; GFX90A-GISEL-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-GISEL-MUBUF-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX90A-GISEL-MUBUF-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX90A-GISEL-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12
+; GFX90A-GISEL-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-GISEL-MUBUF-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-MUBUF-LABEL: private_offset_maybe_oob:
+; GFX10-GISEL-MUBUF: ; %bb.0:
+; GFX10-GISEL-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-MUBUF-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX10-GISEL-MUBUF-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GFX10-GISEL-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12
+; GFX10-GISEL-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-MUBUF-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: private_offset_maybe_oob:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GFX12-GISEL-NEXT: scratch_load_b32 v0, v0, off offset:12
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%idx = add nsw i32 %i, 3
%arrayidx = getelementptr inbounds i32, ptr addrspace(5) %p, i32 %idx
%l = load i32, ptr addrspace(5) %arrayidx
ret i32 %l
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10: {{.*}}
+; GFX10-GISEL-FLATSCR: {{.*}}
+; GFX10-MUBUF: {{.*}}
+; GFX10-SDAG-FLATSCR: {{.*}}
+; GFX12: {{.*}}
+; GFX90A: {{.*}}
+; GFX90A-GISEL-FLATSCR: {{.*}}
+; GFX90A-MUBUF: {{.*}}
+; GFX90A-SDAG-FLATSCR: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-copy-agpr.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-copy-agpr.mir
index a079ee1..6f2e339 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-imm-copy-agpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-imm-copy-agpr.mir
@@ -91,8 +91,8 @@ body: |
bb.0:
; GCN-LABEL: name: v_mov_b64_pseudo_lit_copy_sub0_to_agpr_32
; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329592, implicit $exec
- ; GCN-NEXT: [[COPY:%[0-9]+]]:agpr_32 = COPY [[V_MOV_B]].sub0
- ; GCN-NEXT: $agpr0 = COPY [[COPY]]
+ ; GCN-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 [[V_MOV_B]].sub0, implicit $exec
+ ; GCN-NEXT: $agpr0 = COPY [[V_ACCVGPR_WRITE_B32_e64_]]
; GCN-NEXT: S_ENDPGM 0
%0:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329592, implicit $exec
%1:agpr_32 = COPY %0.sub0
@@ -108,8 +108,8 @@ body: |
bb.0:
; GCN-LABEL: name: v_mov_b64_pseudo_lit_copy_sub1_to_agpr_32
; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329592, implicit $exec
- ; GCN-NEXT: [[COPY:%[0-9]+]]:agpr_32 = COPY [[V_MOV_B]].sub1
- ; GCN-NEXT: $agpr0 = COPY [[COPY]]
+ ; GCN-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 [[V_MOV_B]].sub1, implicit $exec
+ ; GCN-NEXT: $agpr0 = COPY [[V_ACCVGPR_WRITE_B32_e64_]]
; GCN-NEXT: S_ENDPGM 0
%0:vreg_64_align2 = V_MOV_B64_PSEUDO 4290672329592, implicit $exec
%1:agpr_32 = COPY %0.sub1
@@ -133,3 +133,329 @@ body: |
S_ENDPGM 0, implicit %1
...
+
+---
+name: s_mov_b32_imm_0_copy_to_agpr_32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b32_imm_0_copy_to_agpr_32
+ ; GCN: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ACCVGPR_WRITE_B32_e64_]]
+ %0:sreg_32 = S_MOV_B32 0, implicit $exec
+ %1:agpr_32 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b32_imm_neg16_copy_to_agpr_32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b32_imm_neg16_copy_to_agpr_32
+ ; GCN: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 -16, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ACCVGPR_WRITE_B32_e64_]]
+ %0:sreg_32 = S_MOV_B32 -16, implicit $exec
+ %1:agpr_32 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b32_imm_65_copy_to_agpr_32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b32_imm_65_copy_to_agpr_32
+ ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65, implicit $exec
+ ; GCN-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 [[S_MOV_B32_]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ACCVGPR_WRITE_B32_e64_]]
+ %0:sreg_32 = S_MOV_B32 65, implicit $exec
+ %1:agpr_32 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b32_imm_0_copy_to_av_32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b32_imm_0_copy_to_av_32
+ ; GCN: [[AV_MOV_:%[0-9]+]]:av_32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[AV_MOV_]]
+ %0:sreg_32 = S_MOV_B32 0, implicit $exec
+ %1:av_32 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b32_imm_neg16_copy_to_av_32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b32_imm_neg16_copy_to_av_32
+ ; GCN: [[AV_MOV_:%[0-9]+]]:av_32 = AV_MOV_B32_IMM_PSEUDO -16, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[AV_MOV_]]
+ %0:sreg_32 = S_MOV_B32 -16, implicit $exec
+ %1:av_32 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b32_imm_65_copy_to_av_32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b32_imm_65_copy_to_av_32
+ ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65, implicit $exec
+ ; GCN-NEXT: [[AV_MOV_:%[0-9]+]]:av_32 = AV_MOV_B32_IMM_PSEUDO [[S_MOV_B32_]], implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[AV_MOV_]]
+ %0:sreg_32 = S_MOV_B32 65, implicit $exec
+ %1:av_32 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b64_imm_0_copy_to_areg_64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b64_imm_0_copy_to_areg_64
+ ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_64 = COPY [[S_MOV_B64_]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:sreg_64 = S_MOV_B64 0, implicit $exec
+ %1:areg_64 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b64_imm_0_copy_to_areg_64_align2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b64_imm_0_copy_to_areg_64_align2
+ ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_64_align2 = COPY [[S_MOV_B64_]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:sreg_64 = S_MOV_B64 0, implicit $exec
+ %1:areg_64_align2 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b64_imm_neg16_copy_to_areg_64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b64_imm_neg16_copy_to_areg_64
+ ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -16, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_64 = COPY [[S_MOV_B64_]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:sreg_64 = S_MOV_B64 -16, implicit $exec
+ %1:areg_64 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b64_imm_neg16_copy_to_areg_64_align2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b64_imm_neg16_copy_to_areg_64_align2
+ ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -16, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_64_align2 = COPY [[S_MOV_B64_]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:sreg_64 = S_MOV_B64 -16, implicit $exec
+ %1:areg_64_align2 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b64_imm_0_copy_to_av_64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b64_imm_0_copy_to_av_64
+ ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:av_64 = COPY [[S_MOV_B64_]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:sreg_64 = S_MOV_B64 0, implicit $exec
+ %1:av_64 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b64_imm_0_copy_to_av_64_align2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b64_imm_0_copy_to_av_64_align2
+ ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B64_]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:sreg_64 = S_MOV_B64 0, implicit $exec
+ %1:av_64_align2 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b64_imm_neg16_copy_to_av_64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b64_imm_neg16_copy_to_av_64
+ ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -16, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:av_64 = COPY [[S_MOV_B64_]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:sreg_64 = S_MOV_B64 -16, implicit $exec
+ %1:av_64 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b64_imm_neg16_copy_to_av_64_align2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b64_imm_neg16_copy_to_av_64_align2
+ ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -16, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B64_]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:sreg_64 = S_MOV_B64 -16, implicit $exec
+ %1:av_64_align2 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b64_imm_pseudo_literal_32_halves_copy_to_areg_64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b64_imm_pseudo_literal_32_halves_copy_to_areg_64
+ ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -42949672960, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_64 = COPY [[S_MOV_B]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 18446744030759878656, implicit $exec
+ %1:areg_64 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b64_imm_pseudo_literal_32_halves_copy_to_areg_64_align2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b64_imm_pseudo_literal_32_halves_copy_to_areg_64_align2
+ ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -42949672960, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_64_align2 = COPY [[S_MOV_B]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 18446744030759878656, implicit $exec
+ %1:areg_64_align2 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b64_imm_pseudo_inlineimm_32_halves_copy_to_areg_64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b64_imm_pseudo_inlineimm_32_halves_copy_to_areg_64
+ ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -21474836480, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_64 = COPY [[S_MOV_B]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 18446744052234715136, implicit $exec
+ %1:areg_64 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b64_imm_pseudo_inlineimm_32_halves_copy_to_areg_64_align2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b64_imm_pseudo_inlineimm_32_halves_copy_to_areg_64_align2
+ ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -21474836480, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_64_align2 = COPY [[S_MOV_B]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 18446744052234715136, implicit $exec
+ %1:areg_64_align2 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b64_imm_pseudo_literal_32_halves_copy_to_av_64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b64_imm_pseudo_literal_32_halves_copy_to_av_64
+ ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -42949672960, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:av_64 = COPY [[S_MOV_B]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 18446744030759878656, implicit $exec
+ %1:av_64 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b64_imm_pseudo_literal_32_halves_copy_to_av_64_align2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b64_imm_pseudo_literal_32_halves_copy_to_av_64_align2
+ ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -42949672960, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:sreg_64 = S_MOV_B64_IMM_PSEUDO 18446744030759878656, implicit $exec
+ %1:av_64_align2 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b64_imm_pseudo_inlineimm_32_halves_copy_to_av_64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b64_imm_pseudo_inlineimm_32_halves_copy_to_av_64
+ ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775784, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:av_64 = COPY [[S_MOV_B]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775784, implicit $exec
+ %1:av_64 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: s_mov_b64_imm_pseudo_inlineimm_32_halves_copy_to_av_64_align2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_mov_b64_imm_pseudo_inlineimm_32_halves_copy_to_av_64_align2
+ ; GCN: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775784, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:av_64_align2 = COPY [[S_MOV_B]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775784, implicit $exec
+ %1:av_64_align2 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
index 74c4a2d..ddf2aa3 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
@@ -63,8 +63,8 @@ tracksRegLiveness: true
body: |
bb.0:
; GCN-LABEL: name: clear_subreg_imm_fold
- ; GCN: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 4294967288
- ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 4294967295
+ ; GCN: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 -8
+ ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 -1
; GCN-NEXT: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]]
%0:sreg_64 = S_MOV_B64 -8
%1:sgpr_32 = COPY %0.sub0
@@ -191,8 +191,8 @@ body: |
bb.0:
; GCN-LABEL: name: v_mov_b32_imm_literal_copy_v_to_agpr_32
; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 999, implicit $exec
- ; GCN-NEXT: [[COPY:%[0-9]+]]:agpr_32 = COPY [[V_MOV_B32_e32_]]
- ; GCN-NEXT: $agpr0 = COPY [[COPY]]
+ ; GCN-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 [[V_MOV_B32_e32_]], implicit $exec
+ ; GCN-NEXT: $agpr0 = COPY [[V_ACCVGPR_WRITE_B32_e64_]]
; GCN-NEXT: S_ENDPGM 0
%0:vgpr_32 = V_MOV_B32_e32 999, implicit $exec
%1:agpr_32 = COPY %0
@@ -207,9 +207,8 @@ tracksRegLiveness: true
body: |
bb.0:
; GCN-LABEL: name: s_mov_b32_inlineimm_copy_s_to_av_32
- ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32
- ; GCN-NEXT: [[COPY:%[0-9]+]]:av_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: $agpr0 = COPY [[COPY]]
+ ; GCN: [[AV_MOV_:%[0-9]+]]:av_32 = AV_MOV_B32_IMM_PSEUDO 32, implicit $exec
+ ; GCN-NEXT: $agpr0 = COPY [[AV_MOV_]]
; GCN-NEXT: S_ENDPGM 0
%0:sreg_32 = S_MOV_B32 32
%1:av_32 = COPY %0
@@ -224,9 +223,8 @@ tracksRegLiveness: true
body: |
bb.0:
; GCN-LABEL: name: v_mov_b32_inlineimm_copy_v_to_av_32
- ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 32, implicit $exec
- ; GCN-NEXT: [[COPY:%[0-9]+]]:av_32 = COPY [[V_MOV_B32_e32_]]
- ; GCN-NEXT: $agpr0 = COPY [[COPY]]
+ ; GCN: [[AV_MOV_:%[0-9]+]]:av_32 = AV_MOV_B32_IMM_PSEUDO 32, implicit $exec
+ ; GCN-NEXT: $agpr0 = COPY [[AV_MOV_]]
; GCN-NEXT: S_ENDPGM 0
%0:vgpr_32 = V_MOV_B32_e32 32, implicit $exec
%1:av_32 = COPY %0
@@ -242,8 +240,8 @@ body: |
bb.0:
; GCN-LABEL: name: s_mov_b32_imm_literal_copy_s_to_av_32
; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 999
- ; GCN-NEXT: [[COPY:%[0-9]+]]:av_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: $agpr0 = COPY [[COPY]]
+ ; GCN-NEXT: [[AV_MOV_:%[0-9]+]]:av_32 = AV_MOV_B32_IMM_PSEUDO [[S_MOV_B32_]], implicit $exec
+ ; GCN-NEXT: $agpr0 = COPY [[AV_MOV_]]
; GCN-NEXT: S_ENDPGM 0
%0:sreg_32 = S_MOV_B32 999
%1:av_32 = COPY %0
@@ -259,8 +257,8 @@ body: |
bb.0:
; GCN-LABEL: name: v_mov_b32_imm_literal_copy_v_to_av_32
; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 999, implicit $exec
- ; GCN-NEXT: [[COPY:%[0-9]+]]:av_32 = COPY [[V_MOV_B32_e32_]]
- ; GCN-NEXT: $agpr0 = COPY [[COPY]]
+ ; GCN-NEXT: [[AV_MOV_:%[0-9]+]]:av_32 = AV_MOV_B32_IMM_PSEUDO [[V_MOV_B32_e32_]], implicit $exec
+ ; GCN-NEXT: $agpr0 = COPY [[AV_MOV_]]
; GCN-NEXT: S_ENDPGM 0
%0:vgpr_32 = V_MOV_B32_e32 999, implicit $exec
%1:av_32 = COPY %0
@@ -781,3 +779,110 @@ body: |
S_ENDPGM 0
...
+
+---
+name: av_mov_b64_imm_pseudo_copy_av_64_to_physreg_agpr
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: av_mov_b64_imm_pseudo_copy_av_64_to_physreg_agpr
+ ; GCN: [[AV_MOV_:%[0-9]+]]:av_64 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+ ; GCN-NEXT: $agpr0_agpr1 = COPY [[AV_MOV_]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1
+ %0:av_64 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+ $agpr0_agpr1 = COPY %0
+ S_ENDPGM 0, implicit $agpr0_agpr1
+
+...
+
+---
+name: av_mov_b64_imm_pseudo_copy_av_64_to_physreg_vgpr
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: av_mov_b64_imm_pseudo_copy_av_64_to_physreg_vgpr
+ ; GCN: [[AV_MOV_:%[0-9]+]]:av_64 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+ ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[AV_MOV_]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1
+ %0:av_64 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+ $vgpr0_vgpr1 = COPY %0
+ S_ENDPGM 0, implicit $vgpr0_vgpr1
+
+...
+
+---
+name: av_mov_b64_imm_pseudo_copy_av_64_to_virtreg_agpr
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: av_mov_b64_imm_pseudo_copy_av_64_to_virtreg_agpr
+ ; GCN: [[AV_MOV_:%[0-9]+]]:av_64 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_64 = COPY [[AV_MOV_]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:av_64 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+ %1:areg_64 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+# Splat value across both 32-bit halves of the register
+---
+name: av_mov_b64_imm_pseudo_copy_av_64_to_virtreg_vgpr_0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: av_mov_b64_imm_pseudo_copy_av_64_to_virtreg_vgpr_0
+ ; GCN: [[AV_MOV_:%[0-9]+]]:av_64 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY [[AV_MOV_]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:av_64 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+ %1:vreg_64 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+# Low and high halves are different inline constants
+---
+name: av_mov_b64_imm_pseudo_copy_av_64_to_virtreg_vgpr_nonsplat_value
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: av_mov_b64_imm_pseudo_copy_av_64_to_virtreg_vgpr_nonsplat_value
+ ; GCN: [[AV_MOV_:%[0-9]+]]:av_64 = AV_MOV_B64_IMM_PSEUDO 274877906961, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY [[AV_MOV_]]
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:av_64 = AV_MOV_B64_IMM_PSEUDO 274877906961, implicit $exec
+ %1:vreg_64 = COPY %0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: av_mov_b64_imm_pseudo_copy_av_64_to_virtreg_vgpr_nonsplat_value_copy_sub0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: av_mov_b64_imm_pseudo_copy_av_64_to_virtreg_vgpr_nonsplat_value_copy_sub0
+ ; GCN: [[AV_MOV_:%[0-9]+]]:av_64 = AV_MOV_B64_IMM_PSEUDO 274877906961, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[AV_MOV_]].sub0
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:av_64 = AV_MOV_B64_IMM_PSEUDO 274877906961, implicit $exec
+ %1:vgpr_32 = COPY %0.sub0
+ S_ENDPGM 0, implicit %1
+
+...
+
+---
+name: av_mov_b64_imm_pseudo_copy_av_64_to_virtreg_vgpr_nonsplat_value_copy_sub1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: av_mov_b64_imm_pseudo_copy_av_64_to_virtreg_vgpr_nonsplat_value_copy_sub1
+ ; GCN: [[AV_MOV_:%[0-9]+]]:av_64 = AV_MOV_B64_IMM_PSEUDO 274877906961, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[AV_MOV_]].sub1
+ ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]]
+ %0:av_64 = AV_MOV_B64_IMM_PSEUDO 274877906961, implicit $exec
+ %1:vgpr_32 = COPY %0.sub1
+ S_ENDPGM 0, implicit %1
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index f09c257..a859cc9 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -644,11 +644,10 @@ define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind {
; GFX11-TRUE16-LABEL: fmul_pow_mul_max_pow2:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.l, 2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mul_f64 v[0:1], 0x40080000, v[0:1]
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1194,13 +1193,12 @@ define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind {
; GFX11-TRUE16-LABEL: fmul_pow_shl_cnt_safe:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.l, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0xff5f3992
; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0x7befffff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1]
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
index 7fad2f4..a88b1ec 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
@@ -75,7 +75,8 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__s_add_i32__fi_materializedconst_0
- ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, 256, implicit-def $scc
+ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 256
+ ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[S_MOV_B32_]], implicit-def $scc
; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:sreg_32 = S_MOV_B32 %stack.0
diff --git a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir
index cc43142..2f2d727 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir
@@ -46,7 +46,8 @@ body: |
%2:sreg_32 = S_LSHL2_ADD_U32 %0, %1, implicit-def $scc
...
# GCN-LABEL: name: test_frameindex{{$}}
-# GCN: %1:sreg_32 = S_ADD_I32 %stack.0, 70
+# GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 70
+# GCN-NEXT: %1:sreg_32 = S_ADD_I32 %stack.0, [[S_MOV_B32_]]
---
name: test_frameindex
tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll
index 76056d7..1e293c2 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx942 | FileCheck %s -check-prefix=GFX942
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefix=GFX12
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 | FileCheck %s -check-prefix=GFX1250
declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1)
declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
@@ -30,6 +31,18 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_atomic_fadd_f32_noret_pat:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -59,6 +72,18 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -88,6 +113,19 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: flat_atomic_fadd_f32_rtn_pat:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 4.0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0
ret float %ret
}
@@ -112,6 +150,15 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr,
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_atomic_fadd_v2f16_noret:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1250-NEXT: ds_pk_add_f16 v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
ret void
}
@@ -137,6 +184,14 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: local_atomic_fadd_v2f16_rtn:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
ret <2 x half> %ret
}
@@ -161,6 +216,15 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr,
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_atomic_fadd_v2bf16_noret:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1250-NEXT: ds_pk_add_bf16 v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
ret void
}
@@ -186,6 +250,14 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: local_atomic_fadd_v2bf16_rtn:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
ret <2 x i16> %ret
}
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index f9a24fe..3856f0c 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -24,9 +24,9 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_add_noret_f64:
@@ -35,9 +35,9 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_buffer_atomic_add_noret_f64:
@@ -143,9 +143,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8)
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
@@ -154,9 +154,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8)
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
@@ -262,9 +262,9 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_add_noret_f64:
@@ -273,9 +273,9 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_buffer_atomic_add_noret_f64:
@@ -380,9 +380,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
@@ -391,9 +391,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
@@ -498,9 +498,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_min_noret_f64:
@@ -509,9 +509,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_buffer_atomic_min_noret_f64:
@@ -617,9 +617,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8)
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
@@ -628,9 +628,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8)
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
@@ -736,9 +736,9 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_min_noret_f64:
@@ -747,9 +747,9 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_buffer_atomic_min_noret_f64:
@@ -854,9 +854,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
@@ -865,9 +865,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
@@ -972,9 +972,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_buffer_atomic_max_noret_f64:
@@ -983,9 +983,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_buffer_atomic_max_noret_f64:
@@ -1091,9 +1091,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8)
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
@@ -1102,9 +1102,9 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8)
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
@@ -1210,9 +1210,9 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_buffer_atomic_max_noret_f64:
@@ -1221,9 +1221,9 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_buffer_atomic_max_noret_f64:
@@ -1328,9 +1328,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(
; GFX90A-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
@@ -1339,9 +1339,9 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(
; GFX942-NEXT: s_load_dword s8, s[4:5], 0x3c
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942-NEXT: v_mov_b32_e32 v2, s8
-; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
@@ -2079,9 +2079,9 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, do
; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s2
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: ds_add_f64 v0, v[2:3]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_endpgm
;
@@ -2090,9 +2090,9 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, do
; GFX942-NEXT: s_load_dword s2, s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, s2
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-NEXT: ds_add_f64 v2, v[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v0, s2
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-NEXT: ds_add_f64 v0, v[2:3]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_endpgm
;
@@ -2102,23 +2102,10 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, do
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s2
-; GFX1250-NEXT: s_mov_b32 s2, 0
-; GFX1250-NEXT: ds_load_b64 v[0:1], v0
-; GFX1250-NEXT: .LBB51_1: ; %atomicrmw.start
-; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_add_f64_e32 v[4:5], s[0:1], v[0:1]
-; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1]
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
-; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
-; GFX1250-NEXT: s_cbranch_execnz .LBB51_1
-; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
@@ -2148,24 +2135,9 @@ define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) {
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v2, v0
-; GFX1250-NEXT: v_mov_b32_e32 v4, v1
-; GFX1250-NEXT: ds_load_b64 v[0:1], v0
-; GFX1250-NEXT: s_mov_b32 s0, 0
-; GFX1250-NEXT: .LBB52_1: ; %atomicrmw.start
-; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[6:7], v[4:5]
-; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[6:7]
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
-; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT: s_cbranch_execnz .LBB52_1
-; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
@@ -2197,24 +2169,11 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat:
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s0
-; GFX1250-NEXT: s_mov_b32 s0, 0
-; GFX1250-NEXT: ds_load_b64 v[0:1], v0
-; GFX1250-NEXT: .LBB53_1: ; %atomicrmw.start
-; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_add_f64_e32 v[4:5], 4.0, v[0:1]
-; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
-; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT: s_cbranch_execnz .LBB53_1
-; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1250-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2246,24 +2205,11 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush:
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s0
-; GFX1250-NEXT: s_mov_b32 s0, 0
-; GFX1250-NEXT: ds_load_b64 v[0:1], v0
-; GFX1250-NEXT: .LBB54_1: ; %atomicrmw.start
-; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_add_f64_e32 v[4:5], 4.0, v[0:1]
-; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1]
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
-; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT: s_cbranch_execnz .LBB54_1
-; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1250-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2295,24 +2241,11 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s0
-; GFX1250-NEXT: s_mov_b32 s0, 0
-; GFX1250-NEXT: ds_load_b64 v[0:1], v0
-; GFX1250-NEXT: .LBB55_1: ; %atomicrmw.start
-; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_add_f64 v2, v[0:1]
; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_add_f64_e32 v[4:5], 4.0, v[0:1]
-; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[4:5], v2, v[4:5], v[0:1]
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX1250-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
-; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT: s_cbranch_execnz .LBB55_1
-; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1250-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
@@ -2341,23 +2274,9 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, v0
-; GFX1250-NEXT: ds_load_b64 v[0:1], v0
-; GFX1250-NEXT: s_mov_b32 s0, 0
-; GFX1250-NEXT: .LBB56_1: ; %atomicrmw.start
-; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: v_add_f64_e32 v[0:1], 4.0, v[4:5]
-; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[4:5]
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT: s_cbranch_execnz .LBB56_1
-; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -2387,24 +2306,9 @@ define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, doub
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v2, v0
-; GFX1250-NEXT: v_mov_b32_e32 v4, v1
-; GFX1250-NEXT: ds_load_b64 v[0:1], v0
-; GFX1250-NEXT: s_mov_b32 s0, 0
-; GFX1250-NEXT: .LBB57_1: ; %atomicrmw.start
-; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[6:7], v[4:5]
-; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[6:7]
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
-; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT: s_cbranch_execnz .LBB57_1
-; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
@@ -2434,24 +2338,9 @@ define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double
; GFX1250: ; %bb.0: ; %main_body
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v2, v0
-; GFX1250-NEXT: v_mov_b32_e32 v4, v1
-; GFX1250-NEXT: ds_load_b64 v[0:1], v0
-; GFX1250-NEXT: s_mov_b32 s0, 0
-; GFX1250-NEXT: .LBB58_1: ; %atomicrmw.start
-; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[6:7], v[4:5]
-; GFX1250-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[6:7]
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7]
-; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX1250-NEXT: s_cbranch_execnz .LBB58_1
-; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
main_body:
%ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index 189b897..cdd34cb 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -223,9 +223,10 @@ define i128 @fptosi_f64_to_i128(double %x) {
; GISEL-NEXT: v_or3_b32 v8, v1, v2, 1
; GISEL-NEXT: v_mov_b32_e32 v0, 0x433
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: v_and_b32_e32 v2, 0xfffff, v5
+; GISEL-NEXT: v_mov_b32_e32 v2, 0xfffff
+; GISEL-NEXT: s_mov_b32 s6, 0x100000
; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
-; GISEL-NEXT: v_or_b32_e32 v5, 0x100000, v2
+; GISEL-NEXT: v_and_or_b32 v5, v5, v2, s6
; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7]
@@ -587,9 +588,10 @@ define i128 @fptoui_f64_to_i128(double %x) {
; GISEL-NEXT: v_or3_b32 v8, v1, v2, 1
; GISEL-NEXT: v_mov_b32_e32 v0, 0x433
; GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GISEL-NEXT: v_and_b32_e32 v2, 0xfffff, v5
+; GISEL-NEXT: v_mov_b32_e32 v2, 0xfffff
+; GISEL-NEXT: s_mov_b32 s6, 0x100000
; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
-; GISEL-NEXT: v_or_b32_e32 v5, 0x100000, v2
+; GISEL-NEXT: v_and_or_b32 v5, v5, v2, s6
; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index c52fb61..40d2765 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -4372,14 +4372,13 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
; GFX11-GISEL-TRUE16-LABEL: fptrunc_f32_to_f16_zext_i32:
; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s2
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
;
@@ -4607,14 +4606,13 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
; GFX11-GISEL-TRUE16-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry
; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e64 v0.l, |s2|
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 15cda62..f2fe61f 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -360,7 +360,8 @@ entry:
; s_add_i32.
; GCN-LABEL: {{^}}fi_sop2_s_add_u32_literal_error:
-; GCN: s_add_u32 [[ADD_LO:s[0-9]+]], 0, 0x2010
+; GCN: s_movk_i32 [[S_MOVK_I32_:s[0-9]+]], 0x1000
+; GCN: s_add_u32 [[ADD_LO:s[0-9]+]], 0x1010, [[S_MOVK_I32_]]
; GCN: s_addc_u32 [[ADD_HI:s[0-9]+]], s{{[0-9]+}}, 0
define amdgpu_kernel void @fi_sop2_s_add_u32_literal_error() #0 {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 0df1a0f..35913b9 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -1582,28 +1582,22 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; SI-NEXT: s_nop 1
; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
-; SI-NEXT: v_readfirstlane_b32 s2, v5
-; SI-NEXT: s_bfe_u32 s0, s2, 0xb0014
-; SI-NEXT: s_add_i32 s3, s0, 0xfffffc01
-; SI-NEXT: s_mov_b32 s1, 0xfffff
-; SI-NEXT: s_mov_b32 s0, s6
-; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3
-; SI-NEXT: v_not_b32_e32 v6, s0
-; SI-NEXT: v_and_b32_e32 v6, v4, v6
-; SI-NEXT: v_not_b32_e32 v7, s1
-; SI-NEXT: v_and_b32_e32 v5, v5, v7
-; SI-NEXT: s_and_b32 s0, s2, 0x80000000
-; SI-NEXT: s_cmp_lt_i32 s3, 0
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
-; SI-NEXT: v_mov_b32_e32 v7, s0
-; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; SI-NEXT: s_cmp_gt_i32 s3, 51
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: v_mov_b32_e32 v7, s2
-; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; SI-NEXT: v_readfirstlane_b32 s0, v4
+; SI-NEXT: v_readfirstlane_b32 s1, v5
+; SI-NEXT: s_bfe_u32 s2, s1, 0xb0014
+; SI-NEXT: s_add_i32 s8, s2, 0xfffffc01
+; SI-NEXT: s_mov_b32 s3, 0xfffff
+; SI-NEXT: s_mov_b32 s2, s6
+; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s8
+; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3]
+; SI-NEXT: s_and_b32 s9, s1, 0x80000000
+; SI-NEXT: s_cmp_lt_i32 s8, 0
+; SI-NEXT: s_cselect_b32 s2, 0, s2
+; SI-NEXT: s_cselect_b32 s3, s9, s3
+; SI-NEXT: s_cmp_gt_i32 s8, 51
+; SI-NEXT: s_cselect_b32 s1, s1, s3
+; SI-NEXT: s_cselect_b32 s0, s0, s2
+; SI-NEXT: v_fma_f64 v[0:1], -s[0:1], v[2:3], v[0:1]
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -1859,28 +1853,22 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
-; SI-NEXT: v_readfirstlane_b32 s6, v5
-; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014
-; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01
-; SI-NEXT: s_mov_b32 s5, 0xfffff
-; SI-NEXT: s_mov_b32 s4, s2
-; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7
-; SI-NEXT: v_not_b32_e32 v6, s4
-; SI-NEXT: v_and_b32_e32 v6, v4, v6
-; SI-NEXT: v_not_b32_e32 v7, s5
-; SI-NEXT: v_and_b32_e32 v5, v5, v7
-; SI-NEXT: s_and_b32 s4, s6, 0x80000000
-; SI-NEXT: s_cmp_lt_i32 s7, 0
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
-; SI-NEXT: v_mov_b32_e32 v7, s4
-; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; SI-NEXT: s_cmp_gt_i32 s7, 51
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: v_mov_b32_e32 v7, s6
-; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; SI-NEXT: v_readfirstlane_b32 s4, v4
+; SI-NEXT: v_readfirstlane_b32 s5, v5
+; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014
+; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01
+; SI-NEXT: s_mov_b32 s7, 0xfffff
+; SI-NEXT: s_mov_b32 s6, s2
+; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s8
+; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7]
+; SI-NEXT: s_and_b32 s9, s5, 0x80000000
+; SI-NEXT: s_cmp_lt_i32 s8, 0
+; SI-NEXT: s_cselect_b32 s6, 0, s6
+; SI-NEXT: s_cselect_b32 s7, s9, s7
+; SI-NEXT: s_cmp_gt_i32 s8, 51
+; SI-NEXT: s_cselect_b32 s5, s5, s7
+; SI-NEXT: s_cselect_b32 s4, s4, s6
+; SI-NEXT: v_fma_f64 v[0:1], -s[4:5], v[2:3], v[0:1]
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -2109,28 +2097,22 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
-; SI-NEXT: v_readfirstlane_b32 s6, v5
-; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014
-; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01
-; SI-NEXT: s_mov_b32 s5, 0xfffff
-; SI-NEXT: s_mov_b32 s4, s2
-; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7
-; SI-NEXT: v_not_b32_e32 v6, s4
-; SI-NEXT: v_and_b32_e32 v6, v4, v6
-; SI-NEXT: v_not_b32_e32 v7, s5
-; SI-NEXT: v_and_b32_e32 v5, v5, v7
-; SI-NEXT: s_and_b32 s4, s6, 0x80000000
-; SI-NEXT: s_cmp_lt_i32 s7, 0
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
-; SI-NEXT: v_mov_b32_e32 v7, s4
-; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; SI-NEXT: s_cmp_gt_i32 s7, 51
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: v_mov_b32_e32 v7, s6
-; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; SI-NEXT: v_readfirstlane_b32 s4, v4
+; SI-NEXT: v_readfirstlane_b32 s5, v5
+; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014
+; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01
+; SI-NEXT: s_mov_b32 s7, 0xfffff
+; SI-NEXT: s_mov_b32 s6, s2
+; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s8
+; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7]
+; SI-NEXT: s_and_b32 s9, s5, 0x80000000
+; SI-NEXT: s_cmp_lt_i32 s8, 0
+; SI-NEXT: s_cselect_b32 s6, 0, s6
+; SI-NEXT: s_cselect_b32 s7, s9, s7
+; SI-NEXT: s_cmp_gt_i32 s8, 51
+; SI-NEXT: s_cselect_b32 s5, s5, s7
+; SI-NEXT: s_cselect_b32 s4, s4, s6
+; SI-NEXT: v_fma_f64 v[0:1], -s[4:5], v[2:3], v[0:1]
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -5251,27 +5233,22 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_nop 1
; SI-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
; SI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
-; SI-NEXT: v_readfirstlane_b32 s8, v9
-; SI-NEXT: s_bfe_u32 s0, s8, 0xb0014
-; SI-NEXT: s_add_i32 s9, s0, 0xfffffc01
+; SI-NEXT: v_readfirstlane_b32 s0, v8
+; SI-NEXT: v_readfirstlane_b32 s1, v9
+; SI-NEXT: s_bfe_u32 s2, s1, 0xb0014
+; SI-NEXT: s_add_i32 s10, s2, 0xfffffc01
; SI-NEXT: s_mov_b32 s3, 0xfffff
-; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s9
-; SI-NEXT: v_not_b32_e32 v10, s0
-; SI-NEXT: v_and_b32_e32 v10, v8, v10
-; SI-NEXT: v_not_b32_e32 v11, s1
-; SI-NEXT: v_and_b32_e32 v9, v9, v11
-; SI-NEXT: s_and_b32 s0, s8, 0x80000000
-; SI-NEXT: s_cmp_lt_i32 s9, 0
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc
-; SI-NEXT: v_mov_b32_e32 v11, s0
-; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
-; SI-NEXT: s_cmp_gt_i32 s9, 51
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: v_mov_b32_e32 v11, s8
-; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
-; SI-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc
-; SI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
+; SI-NEXT: s_mov_b32 s2, s6
+; SI-NEXT: s_lshr_b64 s[8:9], s[2:3], s10
+; SI-NEXT: s_andn2_b64 s[8:9], s[0:1], s[8:9]
+; SI-NEXT: s_and_b32 s11, s1, 0x80000000
+; SI-NEXT: s_cmp_lt_i32 s10, 0
+; SI-NEXT: s_cselect_b32 s8, 0, s8
+; SI-NEXT: s_cselect_b32 s9, s11, s9
+; SI-NEXT: s_cmp_gt_i32 s10, 51
+; SI-NEXT: s_cselect_b32 s1, s1, s9
+; SI-NEXT: s_cselect_b32 s0, s0, s8
+; SI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[6:7], v[2:3]
; SI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
; SI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
@@ -5287,26 +5264,20 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_nop 1
; SI-NEXT: v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
; SI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
-; SI-NEXT: v_readfirstlane_b32 s8, v7
-; SI-NEXT: s_bfe_u32 s0, s8, 0xb0014
-; SI-NEXT: s_add_i32 s9, s0, 0xfffffc01
-; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s9
-; SI-NEXT: v_not_b32_e32 v8, s0
-; SI-NEXT: v_and_b32_e32 v8, v6, v8
-; SI-NEXT: v_not_b32_e32 v9, s1
-; SI-NEXT: v_and_b32_e32 v7, v7, v9
-; SI-NEXT: s_and_b32 s0, s8, 0x80000000
-; SI-NEXT: s_cmp_lt_i32 s9, 0
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc
-; SI-NEXT: v_mov_b32_e32 v9, s0
-; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
-; SI-NEXT: s_cmp_gt_i32 s9, 51
-; SI-NEXT: s_cselect_b64 vcc, -1, 0
-; SI-NEXT: v_mov_b32_e32 v9, s8
-; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
-; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
-; SI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
+; SI-NEXT: v_readfirstlane_b32 s0, v6
+; SI-NEXT: v_readfirstlane_b32 s1, v7
+; SI-NEXT: s_bfe_u32 s8, s1, 0xb0014
+; SI-NEXT: s_addk_i32 s8, 0xfc01
+; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s8
+; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3]
+; SI-NEXT: s_and_b32 s9, s1, 0x80000000
+; SI-NEXT: s_cmp_lt_i32 s8, 0
+; SI-NEXT: s_cselect_b32 s2, 0, s2
+; SI-NEXT: s_cselect_b32 s3, s9, s3
+; SI-NEXT: s_cmp_gt_i32 s8, 51
+; SI-NEXT: s_cselect_b32 s1, s1, s3
+; SI-NEXT: s_cselect_b32 s0, s0, s2
+; SI-NEXT: v_fma_f64 v[0:1], -s[0:1], v[4:5], v[0:1]
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
index f81950b..c561924 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
@@ -1274,13 +1274,13 @@ define float @v_sqrt_f32_afn_nnan_ninf_nsz(float %x) {
ret float %result
}
-define float @v_sqrt_f32__approx_func_fp_math(float %x) #2 {
+define float @v_sqrt_f32__approx_func_fp_math(float %x) {
; GCN-LABEL: v_sqrt_f32__approx_func_fp_math:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_sqrt_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call nsz float @llvm.sqrt.f32(float %x)
+ %result = call afn nsz float @llvm.sqrt.f32(float %x)
ret float %result
}
@@ -1290,7 +1290,7 @@ define float @v_sqrt_f32__enough_unsafe_attrs(float %x) #3 {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_sqrt_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
- %result = call nsz float @llvm.sqrt.f32(float %x)
+ %result = call afn nsz float @llvm.sqrt.f32(float %x)
ret float %result
}
@@ -4761,8 +4761,7 @@ declare { float, i32 } @llvm.frexp.f32.i32(float) #0
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { convergent nounwind willreturn memory(none) }
-attributes #2 = { "approx-func-fp-math"="true" }
-attributes #3 = { "approx-func-fp-math"="true" "no-nans-fp-math"="true" "no-infs-fp-math"="true" }
+attributes #3 = { "no-nans-fp-math"="true" "no-infs-fp-math"="true" }
attributes #5 = { "no-infs-fp-math"="true" }
!0 = !{float 0.5}
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 8f3b9a5..76e15ee 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -1,136 +1,472 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=pitcairn < %s | FileCheck -check-prefixes=SDAG %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=SDAG %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=pitcairn < %s | FileCheck -check-prefixes=GFX6,GFX6-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=pitcairn < %s | FileCheck -check-prefixes=GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=pitcairn < %s | FileCheck -check-prefixes=GFX6,GFX6-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
define double @v_sqrt_f64(double %x) {
-; GISEL-LABEL: v_sqrt_f64:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_f64:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_f64:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_f64:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_f64:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call double @llvm.sqrt.f64(double %x)
ret double %result
}
define double @v_sqrt_f64_fneg(double %x) {
-; GISEL-LABEL: v_sqrt_f64_fneg:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -v[0:1], v[2:3]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GISEL-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_f64_fneg:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 9
+; GFX6-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_f64_fneg:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 9
+; GFX8-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_f64_fneg:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_f64_fneg:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%x.neg = fneg double %x
%result = call double @llvm.sqrt.f64(double %x.neg)
ret double %result
}
define double @v_sqrt_f64_fabs(double %x) {
-; GISEL-LABEL: v_sqrt_f64_fabs:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_f64_fabs:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX6-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_f64_fabs:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX8-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_f64_fabs:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_f64_fabs:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%x.fabs = call double @llvm.fabs.f64(double %x)
%result = call double @llvm.sqrt.f64(double %x.fabs)
ret double %result
}
define double @v_sqrt_f64_fneg_fabs(double %x) {
-; GISEL-LABEL: v_sqrt_f64_fneg_fabs:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_f64_fneg_fabs:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 9
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_f64_fneg_fabs:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 9
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_f64_fneg_fabs:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_f64_fneg_fabs:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], -|v[0:1]|, v2
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%x.fabs = call double @llvm.fabs.f64(double %x)
%x.fabs.neg = fneg double %x.fabs
%result = call double @llvm.sqrt.f64(double %x.fabs.neg)
@@ -138,159 +474,469 @@ define double @v_sqrt_f64_fneg_fabs(double %x) {
}
define double @v_sqrt_f64_ninf(double %x) {
-; GISEL-LABEL: v_sqrt_f64_ninf:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_f64_ninf:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_f64_ninf:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_f64_ninf:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_f64_ninf:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call ninf double @llvm.sqrt.f64(double %x)
ret double %result
}
define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true" {
-; GISEL-LABEL: v_sqrt_f64_no_infs_attribute:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_f64_no_infs_attribute:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_f64_no_infs_attribute:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_f64_no_infs_attribute:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_f64_no_infs_attribute:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call ninf double @llvm.sqrt.f64(double %x)
ret double %result
}
define double @v_sqrt_f64_nnan(double %x) {
-; GISEL-LABEL: v_sqrt_f64_nnan:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_f64_nnan:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_f64_nnan:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_f64_nnan:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_f64_nnan:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call nnan double @llvm.sqrt.f64(double %x)
ret double %result
}
define amdgpu_ps <2 x i32> @s_sqrt_f64(double inreg %x) {
-; SDAG-LABEL: s_sqrt_f64:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
-; SDAG-NEXT: v_bfrev_b32_e32 v1, 8
-; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
-; SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
-; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v4, 0x260
-; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
-; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
-; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; SDAG-NEXT: v_readfirstlane_b32 s1, v1
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: s_sqrt_f64:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v1, 8
-; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: v_readfirstlane_b32 s0, v0
-; GISEL-NEXT: v_readfirstlane_b32 s1, v1
-; GISEL-NEXT: ; return to shader part epilog
+; GFX6-SDAG-LABEL: s_sqrt_f64:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX6-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
+; GFX6-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GFX6-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX6-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; GFX6-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0x260
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX8-SDAG-LABEL: s_sqrt_f64:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX8-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
+; GFX8-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GFX8-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX8-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; GFX8-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0x260
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX6-GISEL-LABEL: s_sqrt_f64:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-GISEL-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX8-GISEL-LABEL: s_sqrt_f64:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-GISEL-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-GISEL-NEXT: ; return to shader part epilog
%result = call double @llvm.sqrt.f64(double %x)
%cast = bitcast double %result to <2 x i32>
%cast.0 = extractelement <2 x i32> %cast, i32 0
@@ -303,63 +949,121 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64(double inreg %x) {
}
define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) {
-; SDAG-LABEL: s_sqrt_f64_ninf:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
-; SDAG-NEXT: v_bfrev_b32_e32 v1, 8
-; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
-; SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
-; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v4, 0x260
-; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
-; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
-; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; SDAG-NEXT: v_readfirstlane_b32 s1, v1
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: s_sqrt_f64_ninf:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v1, 8
-; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: v_readfirstlane_b32 s0, v0
-; GISEL-NEXT: v_readfirstlane_b32 s1, v1
-; GISEL-NEXT: ; return to shader part epilog
+; GFX6-SDAG-LABEL: s_sqrt_f64_ninf:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX6-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
+; GFX6-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GFX6-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX6-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; GFX6-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0x260
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX8-SDAG-LABEL: s_sqrt_f64_ninf:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX8-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
+; GFX8-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GFX8-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX8-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; GFX8-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0x260
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX6-GISEL-LABEL: s_sqrt_f64_ninf:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-GISEL-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX8-GISEL-LABEL: s_sqrt_f64_ninf:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-GISEL-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-GISEL-NEXT: ; return to shader part epilog
%result = call ninf double @llvm.sqrt.f64(double %x)
%cast = bitcast double %result to <2 x i32>
%cast.0 = extractelement <2 x i32> %cast, i32 0
@@ -372,63 +1076,121 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) {
}
define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) {
-; SDAG-LABEL: s_sqrt_f64_afn:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
-; SDAG-NEXT: v_bfrev_b32_e32 v1, 8
-; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
-; SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
-; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v4, 0x260
-; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
-; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
-; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; SDAG-NEXT: v_readfirstlane_b32 s1, v1
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: s_sqrt_f64_afn:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v1, 8
-; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: v_readfirstlane_b32 s0, v0
-; GISEL-NEXT: v_readfirstlane_b32 s1, v1
-; GISEL-NEXT: ; return to shader part epilog
+; GFX6-SDAG-LABEL: s_sqrt_f64_afn:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX6-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
+; GFX6-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GFX6-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX6-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; GFX6-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0x260
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX8-SDAG-LABEL: s_sqrt_f64_afn:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX8-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
+; GFX8-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GFX8-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX8-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; GFX8-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0x260
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX6-GISEL-LABEL: s_sqrt_f64_afn:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-GISEL-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX8-GISEL-LABEL: s_sqrt_f64_afn:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-GISEL-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-GISEL-NEXT: ; return to shader part epilog
%result = call afn double @llvm.sqrt.f64(double %x)
%cast = bitcast double %result to <2 x i32>
%cast.0 = extractelement <2 x i32> %cast, i32 0
@@ -441,63 +1203,121 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) {
}
define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) {
-; SDAG-LABEL: s_sqrt_f64_afn_nnan_ninf:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
-; SDAG-NEXT: v_bfrev_b32_e32 v1, 8
-; SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
-; SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
-; SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v4, 0x260
-; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
-; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
-; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; SDAG-NEXT: v_readfirstlane_b32 s1, v1
-; SDAG-NEXT: ; return to shader part epilog
-;
-; GISEL-LABEL: s_sqrt_f64_afn_nnan_ninf:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v1, 8
-; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: v_readfirstlane_b32 s0, v0
-; GISEL-NEXT: v_readfirstlane_b32 s1, v1
-; GISEL-NEXT: ; return to shader part epilog
+; GFX6-SDAG-LABEL: s_sqrt_f64_afn_nnan_ninf:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX6-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
+; GFX6-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GFX6-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX6-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; GFX6-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0x260
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX8-SDAG-LABEL: s_sqrt_f64_afn_nnan_ninf:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX8-SDAG-NEXT: v_bfrev_b32_e32 v1, 8
+; GFX8-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GFX8-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec
+; GFX8-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; GFX8-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0x260
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX6-GISEL-LABEL: s_sqrt_f64_afn_nnan_ninf:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-GISEL-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX8-GISEL-LABEL: s_sqrt_f64_afn_nnan_ninf:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v1, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-GISEL-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-GISEL-NEXT: ; return to shader part epilog
%result = call afn nnan ninf double @llvm.sqrt.f64(double %x)
%cast = bitcast double %result to <2 x i32>
%cast.0 = extractelement <2 x i32> %cast, i32 0
@@ -510,819 +1330,2128 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) {
}
define double @v_sqrt_f64_nsz(double %x) {
-; GISEL-LABEL: v_sqrt_f64_nsz:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_f64_nsz:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_f64_nsz:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_f64_nsz:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_f64_nsz:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call nsz double @llvm.sqrt.f64(double %x)
ret double %result
}
define double @v_sqrt_f64_nnan_ninf(double %x) {
-; GISEL-LABEL: v_sqrt_f64_nnan_ninf:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_f64_nnan_ninf:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_f64_nnan_ninf:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_f64_nnan_ninf:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_f64_nnan_ninf:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call nnan ninf double @llvm.sqrt.f64(double %x)
ret double %result
}
define double @v_sqrt_f64_nnan_ninf_nsz(double %x) {
-; GISEL-LABEL: v_sqrt_f64_nnan_ninf_nsz:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_f64_nnan_ninf_nsz:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_f64_nnan_ninf_nsz:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_f64_nnan_ninf_nsz:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_f64_nnan_ninf_nsz:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call nnan ninf nsz double @llvm.sqrt.f64(double %x)
ret double %result
}
define double @v_sqrt_f64_afn(double %x) {
-; GISEL-LABEL: v_sqrt_f64_afn:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_f64_afn:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_f64_afn:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_f64_afn:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_f64_afn:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call afn double @llvm.sqrt.f64(double %x)
ret double %result
}
define double @v_sqrt_f64_afn_nsz(double %x) {
-; GISEL-LABEL: v_sqrt_f64_afn_nsz:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_f64_afn_nsz:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_f64_afn_nsz:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_f64_afn_nsz:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_f64_afn_nsz:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call afn nsz double @llvm.sqrt.f64(double %x)
ret double %result
}
define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) {
-; SDAG-LABEL: v_sqrt_v2f64_afn:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: s_mov_b32 s4, 0
-; SDAG-NEXT: s_brev_b32 s5, 8
-; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
-; SDAG-NEXT: v_mov_b32_e32 v4, 0x100
-; SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
-; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5
-; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
-; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
-; SDAG-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5]
-; SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
-; SDAG-NEXT: v_mul_f64 v[10:11], v[2:3], v[6:7]
-; SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
-; SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
-; SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
-; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
-; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
-; SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
-; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
-; SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
-; SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
-; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_sqrt_v2f64_afn:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b32 s4, 0
-; GISEL-NEXT: s_brev_b32 s5, 8
-; GISEL-NEXT: v_mov_b32_e32 v4, s4
-; GISEL-NEXT: v_mov_b32_e32 v5, s5
-; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
-; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
-; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
-; GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5
-; GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
-; GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5
-; GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
-; GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
-; GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
-; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_v2f64_afn:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_mul_f64 v[10:11], v[2:3], v[6:7]
+; GFX6-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GFX6-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; GFX6-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; GFX6-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_v2f64_afn:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_mul_f64 v[10:11], v[2:3], v[6:7]
+; GFX8-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GFX8-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; GFX8-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; GFX8-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_v2f64_afn:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX6-GISEL-NEXT: s_brev_b32 s5, 8
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, s4
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, s5
+; GFX6-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; GFX6-GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; GFX6-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GFX6-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX6-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GFX6-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_v2f64_afn:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX8-GISEL-NEXT: s_brev_b32 s5, 8
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; GFX8-GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; GFX8-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GFX8-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX8-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GFX8-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call afn <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
ret <2 x double> %result
}
define double @v_sqrt_f64_afn_nnan(double %x) {
-; GISEL-LABEL: v_sqrt_f64_afn_nnan:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_f64_afn_nnan:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_f64_afn_nnan:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_f64_afn_nnan:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_f64_afn_nnan:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call afn nnan double @llvm.sqrt.f64(double %x)
ret double %result
}
define double @v_sqrt_f64_fabs_afn_ninf(double %x) {
-; GISEL-LABEL: v_sqrt_f64_fabs_afn_ninf:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_f64_fabs_afn_ninf:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX6-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_f64_fabs_afn_ninf:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX8-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_f64_fabs_afn_ninf:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_f64_fabs_afn_ninf:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], |v[0:1]|, v2
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%fabs = call double @llvm.fabs.f64(double %x)
%result = call afn ninf double @llvm.sqrt.f64(double %fabs)
ret double %result
}
define double @v_sqrt_f64_afn_nnan_ninf(double %x) {
-; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call afn nnan ninf double @llvm.sqrt.f64(double %x)
ret double %result
}
define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) {
-; SDAG-LABEL: v_sqrt_v2f64_afn_nnan_ninf:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: s_mov_b32 s4, 0
-; SDAG-NEXT: s_brev_b32 s5, 8
-; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
-; SDAG-NEXT: v_mov_b32_e32 v4, 0x100
-; SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
-; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5
-; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
-; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
-; SDAG-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5]
-; SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
-; SDAG-NEXT: v_mul_f64 v[10:11], v[2:3], v[6:7]
-; SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
-; SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
-; SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
-; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
-; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
-; SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
-; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
-; SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
-; SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
-; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_sqrt_v2f64_afn_nnan_ninf:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b32 s4, 0
-; GISEL-NEXT: s_brev_b32 s5, 8
-; GISEL-NEXT: v_mov_b32_e32 v4, s4
-; GISEL-NEXT: v_mov_b32_e32 v5, s5
-; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
-; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
-; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
-; GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5
-; GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
-; GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5
-; GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
-; GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
-; GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
-; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_v2f64_afn_nnan_ninf:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_mul_f64 v[10:11], v[2:3], v[6:7]
+; GFX6-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GFX6-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; GFX6-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; GFX6-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_v2f64_afn_nnan_ninf:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_mul_f64 v[10:11], v[2:3], v[6:7]
+; GFX8-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GFX8-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; GFX8-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; GFX8-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_v2f64_afn_nnan_ninf:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX6-GISEL-NEXT: s_brev_b32 s5, 8
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, s4
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, s5
+; GFX6-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; GFX6-GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; GFX6-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GFX6-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX6-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GFX6-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_v2f64_afn_nnan_ninf:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX8-GISEL-NEXT: s_brev_b32 s5, 8
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; GFX8-GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; GFX8-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GFX8-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX8-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GFX8-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call afn nnan ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
ret <2 x double> %result
}
define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) {
-; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call afn nnan ninf nsz double @llvm.sqrt.f64(double %x)
ret double %result
}
-define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 {
-; GISEL-LABEL: v_sqrt_f64__approx_func_fp_math:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: s_setpc_b64 s[30:31]
- %result = call nsz double @llvm.sqrt.f64(double %x)
- ret double %result
-}
-
define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
-; GISEL-LABEL: v_sqrt_f64__enough_unsafe_attrs:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_f64__enough_unsafe_attrs:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_f64__enough_unsafe_attrs:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_f64__enough_unsafe_attrs:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_f64__enough_unsafe_attrs:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call nsz double @llvm.sqrt.f64(double %x)
ret double %result
}
define double @v_sqrt_f64__unsafe_attr(double %x) #4 {
-; GISEL-LABEL: v_sqrt_f64__unsafe_attr:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GISEL-NEXT: v_bfrev_b32_e32 v3, 8
-; GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
-; GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
-; GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
-; GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
-; GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_f64__unsafe_attr:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_f64__unsafe_attr:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_f64__unsafe_attr:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_f64__unsafe_attr:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v3, 8
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call nsz double @llvm.sqrt.f64(double %x)
ret double %result
}
define <2 x double> @v_sqrt_v2f64(<2 x double> %x) {
-; SDAG-LABEL: v_sqrt_v2f64:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: s_mov_b32 s4, 0
-; SDAG-NEXT: s_brev_b32 s5, 8
-; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
-; SDAG-NEXT: v_mov_b32_e32 v4, 0x100
-; SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
-; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5
-; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
-; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
-; SDAG-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5]
-; SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
-; SDAG-NEXT: v_mul_f64 v[10:11], v[2:3], v[6:7]
-; SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
-; SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
-; SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
-; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
-; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
-; SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
-; SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
-; SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
-; SDAG-NEXT: v_mov_b32_e32 v9, 0x260
-; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
-; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_sqrt_v2f64:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b32 s4, 0
-; GISEL-NEXT: s_brev_b32 s5, 8
-; GISEL-NEXT: v_mov_b32_e32 v4, s4
-; GISEL-NEXT: v_mov_b32_e32 v5, s5
-; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
-; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
-; GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
-; GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
-; GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5
-; GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
-; GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5
-; GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
-; GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
-; GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
-; GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
-; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
-; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
-; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
-; GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
-; GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
-; GISEL-NEXT: v_mov_b32_e32 v9, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
-; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
-; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
-; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_v2f64:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0x100
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5]
+; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; GFX6-SDAG-NEXT: v_mul_f64 v[10:11], v[2:3], v[6:7]
+; GFX6-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GFX6-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; GFX6-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; GFX6-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_v2f64:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s5, 8
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[4:5], v[2:3]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0x100
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v5
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_mul_f64 v[8:9], v[0:1], v[4:5]
+; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[4:5], 0.5
+; GFX8-SDAG-NEXT: v_mul_f64 v[10:11], v[2:3], v[6:7]
+; GFX8-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GFX8-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[4:5], v[8:9]
+; GFX8-SDAG-NEXT: v_fma_f64 v[10:11], v[14:15], v[6:7], v[10:11]
+; GFX8-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[8:9], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[14:15], -v[10:11], v[10:11], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[8:9]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], v[14:15], v[6:7], v[10:11]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v9, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_v2f64:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX6-GISEL-NEXT: s_brev_b32 s5, 8
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, s4
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, s5
+; GFX6-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; GFX6-GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; GFX6-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GFX6-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX6-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GFX6-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_v2f64:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX8-GISEL-NEXT: s_brev_b32 s5, 8
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[4:5]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[4:5], v[0:1]
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_mul_f64 v[8:9], v[4:5], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[0:1], v[4:5]
+; GFX8-GISEL-NEXT: v_mul_f64 v[10:11], v[6:7], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7]
+; GFX8-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[4:5], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[14:15], -v[10:11], v[6:7], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7]
+; GFX8-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11]
+; GFX8-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GFX8-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[4:5], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[6:7], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[12:13], v[8:9], v[4:5]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, 0xffffff80
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], v[14:15], v[10:11], v[6:7]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v8, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v9
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
ret <2 x double> %result
}
define <3 x double> @v_sqrt_v3f64(<3 x double> %x) {
-; SDAG-LABEL: v_sqrt_v3f64:
-; SDAG: ; %bb.0:
-; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: s_mov_b32 s6, 0
-; SDAG-NEXT: s_brev_b32 s7, 8
-; SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
-; SDAG-NEXT: v_mov_b32_e32 v10, 0x100
-; SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[6:7], v[2:3]
-; SDAG-NEXT: v_cmp_gt_f64_e64 s[6:7], s[6:7], v[4:5]
-; SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc
-; SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
-; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[4:5]
-; SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6
-; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[6:7]
-; SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
-; SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
-; SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[2:3]
-; SDAG-NEXT: v_rsq_f64_e32 v[12:13], v[4:5]
-; SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7]
-; SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
-; SDAG-NEXT: v_mul_f64 v[14:15], v[2:3], v[8:9]
-; SDAG-NEXT: v_mul_f64 v[8:9], v[8:9], 0.5
-; SDAG-NEXT: v_fma_f64 v[16:17], -v[6:7], v[10:11], 0.5
-; SDAG-NEXT: v_fma_f64 v[18:19], -v[8:9], v[14:15], 0.5
-; SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[16:17], v[10:11]
-; SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7]
-; SDAG-NEXT: v_mul_f64 v[16:17], v[4:5], v[12:13]
-; SDAG-NEXT: v_mul_f64 v[12:13], v[12:13], 0.5
-; SDAG-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
-; SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[18:19], v[8:9]
-; SDAG-NEXT: v_fma_f64 v[18:19], -v[12:13], v[16:17], 0.5
-; SDAG-NEXT: v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17]
-; SDAG-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
-; SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11]
-; SDAG-NEXT: v_fma_f64 v[18:19], -v[14:15], v[14:15], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[14:15], v[18:19], v[8:9], v[14:15]
-; SDAG-NEXT: v_fma_f64 v[18:19], -v[16:17], v[16:17], v[4:5]
-; SDAG-NEXT: v_fma_f64 v[16:17], v[18:19], v[12:13], v[16:17]
-; SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
-; SDAG-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[10:11]
-; SDAG-NEXT: v_fma_f64 v[10:11], -v[14:15], v[14:15], v[2:3]
-; SDAG-NEXT: v_fma_f64 v[18:19], -v[16:17], v[16:17], v[4:5]
-; SDAG-NEXT: v_fma_f64 v[8:9], v[10:11], v[8:9], v[14:15]
-; SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[12:13], v[16:17]
-; SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80
-; SDAG-NEXT: v_mov_b32_e32 v15, 0x260
-; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v14, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v14, 0, v14, s[6:7]
-; SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v12
-; SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15
-; SDAG-NEXT: v_ldexp_f64 v[8:9], v[8:9], v13
-; SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v15
-; SDAG-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
-; SDAG-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v15
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7]
-; SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: v_sqrt_v3f64:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b32 s4, 0
-; GISEL-NEXT: s_brev_b32 s5, 8
-; GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
-; GISEL-NEXT: v_mov_b32_e32 v6, s4
-; GISEL-NEXT: v_mov_b32_e32 v7, s5
-; GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7]
-; GISEL-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8
-; GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8
-; GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[0:1]
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
-; GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
-; GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
-; GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3]
-; GISEL-NEXT: v_rsq_f64_e32 v[12:13], v[4:5]
-; GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], 0.5
-; GISEL-NEXT: v_mul_f64 v[8:9], v[0:1], v[8:9]
-; GISEL-NEXT: v_mul_f64 v[14:15], v[10:11], 0.5
-; GISEL-NEXT: v_mul_f64 v[10:11], v[2:3], v[10:11]
-; GISEL-NEXT: v_fma_f64 v[16:17], -v[6:7], v[8:9], 0.5
-; GISEL-NEXT: v_fma_f64 v[18:19], -v[14:15], v[10:11], 0.5
-; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[16:17], v[8:9]
-; GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7]
-; GISEL-NEXT: v_mul_f64 v[16:17], v[12:13], 0.5
-; GISEL-NEXT: v_mul_f64 v[12:13], v[4:5], v[12:13]
-; GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11]
-; GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
-; GISEL-NEXT: v_fma_f64 v[18:19], -v[16:17], v[12:13], 0.5
-; GISEL-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
-; GISEL-NEXT: v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17]
-; GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[8:9], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[8:9], v[18:19], v[6:7], v[8:9]
-; GISEL-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[10:11], v[18:19], v[14:15], v[10:11]
-; GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[12:13], v[18:19], v[16:17], v[12:13]
-; GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[8:9], v[0:1]
-; GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[8:9]
-; GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[10:11], v[2:3]
-; GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[4:5]
-; GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[10:11]
-; GISEL-NEXT: v_fma_f64 v[10:11], v[18:19], v[16:17], v[12:13]
-; GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80
-; GISEL-NEXT: v_mov_b32_e32 v15, 0x260
-; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, v14, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, v14, s[6:7]
-; GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v12
-; GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15
-; GISEL-NEXT: v_ldexp_f64 v[8:9], v[8:9], v13
-; GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v15
-; GISEL-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
-; GISEL-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v15
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7]
-; GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: v_sqrt_v3f64:
+; GFX6-SDAG: ; %bb.0:
+; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT: s_mov_b32 s6, 0
+; GFX6-SDAG-NEXT: s_brev_b32 s7, 8
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v10, 0x100
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[6:7], v[2:3]
+; GFX6-SDAG-NEXT: v_cmp_gt_f64_e64 s[6:7], s[6:7], v[4:5]
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[4:5]
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6
+; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[6:7]
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[2:3]
+; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[12:13], v[4:5]
+; GFX6-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7]
+; GFX6-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
+; GFX6-SDAG-NEXT: v_mul_f64 v[14:15], v[2:3], v[8:9]
+; GFX6-SDAG-NEXT: v_mul_f64 v[8:9], v[8:9], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[16:17], -v[6:7], v[10:11], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[18:19], -v[8:9], v[14:15], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[16:17], v[10:11]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7]
+; GFX6-SDAG-NEXT: v_mul_f64 v[16:17], v[4:5], v[12:13]
+; GFX6-SDAG-NEXT: v_mul_f64 v[12:13], v[12:13], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
+; GFX6-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[18:19], v[8:9]
+; GFX6-SDAG-NEXT: v_fma_f64 v[18:19], -v[12:13], v[16:17], 0.5
+; GFX6-SDAG-NEXT: v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17]
+; GFX6-SDAG-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
+; GFX6-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11]
+; GFX6-SDAG-NEXT: v_fma_f64 v[18:19], -v[14:15], v[14:15], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[14:15], v[18:19], v[8:9], v[14:15]
+; GFX6-SDAG-NEXT: v_fma_f64 v[18:19], -v[16:17], v[16:17], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[16:17], v[18:19], v[12:13], v[16:17]
+; GFX6-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
+; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[10:11]
+; GFX6-SDAG-NEXT: v_fma_f64 v[10:11], -v[14:15], v[14:15], v[2:3]
+; GFX6-SDAG-NEXT: v_fma_f64 v[18:19], -v[16:17], v[16:17], v[4:5]
+; GFX6-SDAG-NEXT: v_fma_f64 v[8:9], v[10:11], v[8:9], v[14:15]
+; GFX6-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[12:13], v[16:17]
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80
+; GFX6-SDAG-NEXT: v_mov_b32_e32 v15, 0x260
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v14, s[4:5]
+; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, v14, s[6:7]
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v12
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[8:9], v[8:9], v13
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v15
+; GFX6-SDAG-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
+; GFX6-SDAG-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v15
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5]
+; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7]
+; GFX6-SDAG-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7]
+; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_sqrt_v3f64:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: s_mov_b32 s6, 0
+; GFX8-SDAG-NEXT: s_brev_b32 s7, 8
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v10, 0x100
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e64 s[4:5], s[6:7], v[2:3]
+; GFX8-SDAG-NEXT: v_cmp_gt_f64_e64 s[6:7], s[6:7], v[4:5]
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, 0, v10, vcc
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[4:5]
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v6
+; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[6:7]
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v10
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1]
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[8:9], v[2:3]
+; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[12:13], v[4:5]
+; GFX8-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7]
+; GFX8-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5
+; GFX8-SDAG-NEXT: v_mul_f64 v[14:15], v[2:3], v[8:9]
+; GFX8-SDAG-NEXT: v_mul_f64 v[8:9], v[8:9], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[16:17], -v[6:7], v[10:11], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[18:19], -v[8:9], v[14:15], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[16:17], v[10:11]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7]
+; GFX8-SDAG-NEXT: v_mul_f64 v[16:17], v[4:5], v[12:13]
+; GFX8-SDAG-NEXT: v_mul_f64 v[12:13], v[12:13], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
+; GFX8-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[18:19], v[8:9]
+; GFX8-SDAG-NEXT: v_fma_f64 v[18:19], -v[12:13], v[16:17], 0.5
+; GFX8-SDAG-NEXT: v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17]
+; GFX8-SDAG-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
+; GFX8-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[6:7], v[10:11]
+; GFX8-SDAG-NEXT: v_fma_f64 v[18:19], -v[14:15], v[14:15], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[14:15], v[18:19], v[8:9], v[14:15]
+; GFX8-SDAG-NEXT: v_fma_f64 v[18:19], -v[16:17], v[16:17], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[16:17], v[18:19], v[12:13], v[16:17]
+; GFX8-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[0:1]
+; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[10:11]
+; GFX8-SDAG-NEXT: v_fma_f64 v[10:11], -v[14:15], v[14:15], v[2:3]
+; GFX8-SDAG-NEXT: v_fma_f64 v[18:19], -v[16:17], v[16:17], v[4:5]
+; GFX8-SDAG-NEXT: v_fma_f64 v[8:9], v[10:11], v[8:9], v[14:15]
+; GFX8-SDAG-NEXT: v_fma_f64 v[10:11], v[18:19], v[12:13], v[16:17]
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v14, 0xffffff80
+; GFX8-SDAG-NEXT: v_mov_b32_e32 v15, 0x260
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v14, s[4:5]
+; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, v14, s[6:7]
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[6:7], v[6:7], v12
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[8:9], v[8:9], v13
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v15
+; GFX8-SDAG-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
+; GFX8-SDAG-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v15
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5]
+; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7]
+; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7]
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: v_sqrt_v3f64:
+; GFX6-GISEL: ; %bb.0:
+; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX6-GISEL-NEXT: s_brev_b32 s5, 8
+; GFX6-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v6, s4
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v7, s5
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[6:7]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[0:1]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
+; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3]
+; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[12:13], v[4:5]
+; GFX6-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[8:9], v[0:1], v[8:9]
+; GFX6-GISEL-NEXT: v_mul_f64 v[14:15], v[10:11], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[10:11], v[2:3], v[10:11]
+; GFX6-GISEL-NEXT: v_fma_f64 v[16:17], -v[6:7], v[8:9], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[18:19], -v[14:15], v[10:11], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[16:17], v[8:9]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7]
+; GFX6-GISEL-NEXT: v_mul_f64 v[16:17], v[12:13], 0.5
+; GFX6-GISEL-NEXT: v_mul_f64 v[12:13], v[4:5], v[12:13]
+; GFX6-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11]
+; GFX6-GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
+; GFX6-GISEL-NEXT: v_fma_f64 v[18:19], -v[16:17], v[12:13], 0.5
+; GFX6-GISEL-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
+; GFX6-GISEL-NEXT: v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17]
+; GFX6-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[8:9], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[8:9], v[18:19], v[6:7], v[8:9]
+; GFX6-GISEL-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[10:11], v[18:19], v[14:15], v[10:11]
+; GFX6-GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[12:13], v[18:19], v[16:17], v[12:13]
+; GFX6-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[8:9], v[0:1]
+; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[8:9]
+; GFX6-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[10:11], v[2:3]
+; GFX6-GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[4:5]
+; GFX6-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[10:11]
+; GFX6-GISEL-NEXT: v_fma_f64 v[10:11], v[18:19], v[16:17], v[12:13]
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80
+; GFX6-GISEL-NEXT: v_mov_b32_e32 v15, 0x260
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v13, 0, v14, s[4:5]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, v14, s[6:7]
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v12
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[8:9], v[8:9], v13
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v15
+; GFX6-GISEL-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
+; GFX6-GISEL-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v15
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7]
+; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7]
+; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_sqrt_v3f64:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX8-GISEL-NEXT: s_brev_b32 s5, 8
+; GFX8-GISEL-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], v[0:1], v6
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, s4
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v7, s5
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7]
+; GFX8-GISEL-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[6:7]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v8, 8, v8
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v8
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[8:9], v[0:1]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
+; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[10:11], v[2:3]
+; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[12:13], v[4:5]
+; GFX8-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[8:9], v[0:1], v[8:9]
+; GFX8-GISEL-NEXT: v_mul_f64 v[14:15], v[10:11], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[10:11], v[2:3], v[10:11]
+; GFX8-GISEL-NEXT: v_fma_f64 v[16:17], -v[6:7], v[8:9], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[18:19], -v[14:15], v[10:11], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[16:17], v[8:9]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[16:17], v[6:7]
+; GFX8-GISEL-NEXT: v_mul_f64 v[16:17], v[12:13], 0.5
+; GFX8-GISEL-NEXT: v_mul_f64 v[12:13], v[4:5], v[12:13]
+; GFX8-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11]
+; GFX8-GISEL-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15]
+; GFX8-GISEL-NEXT: v_fma_f64 v[18:19], -v[16:17], v[12:13], 0.5
+; GFX8-GISEL-NEXT: v_fma_f64 v[12:13], v[12:13], v[18:19], v[12:13]
+; GFX8-GISEL-NEXT: v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17]
+; GFX8-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[8:9], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[8:9], v[18:19], v[6:7], v[8:9]
+; GFX8-GISEL-NEXT: v_fma_f64 v[18:19], -v[10:11], v[10:11], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[10:11], v[18:19], v[14:15], v[10:11]
+; GFX8-GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[12:13], v[18:19], v[16:17], v[12:13]
+; GFX8-GISEL-NEXT: v_fma_f64 v[18:19], -v[8:9], v[8:9], v[0:1]
+; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], v[18:19], v[6:7], v[8:9]
+; GFX8-GISEL-NEXT: v_fma_f64 v[8:9], -v[10:11], v[10:11], v[2:3]
+; GFX8-GISEL-NEXT: v_fma_f64 v[18:19], -v[12:13], v[12:13], v[4:5]
+; GFX8-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[10:11]
+; GFX8-GISEL-NEXT: v_fma_f64 v[10:11], v[18:19], v[16:17], v[12:13]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v14, 0xffffff80
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v15, 0x260
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v13, 0, v14, s[4:5]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, v14, s[6:7]
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[6:7], v[6:7], v12
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v15
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[8:9], v[8:9], v13
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v15
+; GFX8-GISEL-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
+; GFX8-GISEL-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], v15
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7]
+; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7]
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <3 x double> @llvm.sqrt.v3f64(<3 x double> %x)
ret <3 x double> %result
}
@@ -1335,6 +3464,8 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) #1
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { convergent nounwind willreturn memory(none) }
-attributes #2 = { "approx-func-fp-math"="true" }
-attributes #3 = { "approx-func-fp-math"="true" "no-nans-fp-math"="true" "no-infs-fp-math"="true" }
+attributes #3 = { "no-nans-fp-math"="true" "no-infs-fp-math"="true" }
attributes #4 = { "unsafe-fp-math"="true" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX6: {{.*}}
+; GFX8: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 95e28a3..3c41cc4 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -1107,21 +1107,19 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 {
; GFX11-TRUE16-LABEL: void_func_v4i8:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1190,22 +1188,20 @@ define void @void_func_v5i8(<5 x i8> %arg0) #0 {
; GFX11-TRUE16-LABEL: void_func_v5i8:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 4
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
; GFX11-TRUE16-NEXT: buffer_store_b8 v4, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v2
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1285,29 +1281,27 @@ define void @void_func_v8i8(<8 x i8> %arg0) #0 {
; GFX11-TRUE16-LABEL: void_func_v8i8:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v5.h, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v6.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v0.h, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v6
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v6.l
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v6
; GFX11-TRUE16-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1422,47 +1416,44 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 {
; GFX11-TRUE16-LABEL: void_func_v16i8:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.h, v12.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v8.h, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v10.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v5.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v14.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v10.l, v8.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v11
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v4, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v14
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v14.l
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v12
-; GFX11-TRUE16-NEXT: buffer_store_b128 v[6:9], off, s[0:3], 0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v14
+; GFX11-TRUE16-NEXT: buffer_store_b128 v[5:8], off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: void_func_v16i8:
@@ -1658,83 +1649,77 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v31, off, s32
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, 0
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v15.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v32.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v7.h, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v3.h, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v32.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v7.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v32
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v6.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v32
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.h, v4.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v32
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v32
; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v6.l, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v11.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v5
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v8.h, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v0.h, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v32
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v0.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v6.h, v5.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v7.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 16
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.h, v5.h
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v31.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v9.l, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v10.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v5.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v11, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.l, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v31.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v32
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.l, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v17.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v14, v32
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v8.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v5.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v32
+; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v4.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v32
; GFX11-TRUE16-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
diff --git a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
index b24ebbd9..9db7600 100644
--- a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
@@ -38,9 +38,9 @@ define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b
; CHECK-NEXT: s_mov_b64 s[4:5], -1
; CHECK-NEXT: s_cbranch_vccz .LBB0_5
; CHECK-NEXT: ; %bb.4: ; %atomicrmw.global
-; CHECK-NEXT: v_mov_b32_e32 v2, 0
-; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; CHECK-NEXT: global_atomic_add_f64 v0, v[2:3], s[0:1]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: s_mov_b64 s[4:5], 0
@@ -62,9 +62,9 @@ define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b
; CHECK-NEXT: .LBB0_8: ; %atomicrmw.shared
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b32 s0, s0, -1
-; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; CHECK-NEXT: ds_add_f64 v2, v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; CHECK-NEXT: ds_add_f64 v0, v[2:3]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index 2fdc1a8..f67ab18 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -559,33 +559,61 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: test_call_external_void_func_i8_imm:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s33
-; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_i8@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_i8@abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-NEXT: s_mov_b32 s32, s33
-; GFX11-NEXT: v_readlane_b32 s0, v40, 2
-; GFX11-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_mov_b32 s33, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_i8_imm:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT: s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7b
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_i8@abs32@hi
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_i8@abs32@lo
+; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
+; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT: s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_i8_imm:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT: s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_i8@abs32@hi
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_i8@abs32@lo
+; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
+; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_imm:
; GFX10-SCRATCH: ; %bb.0:
@@ -978,33 +1006,61 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: test_call_external_void_func_i16_imm:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s33
-; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_i16@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_i16@abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-NEXT: s_mov_b32 s32, s33
-; GFX11-NEXT: v_readlane_b32 s0, v40, 2
-; GFX11-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_mov_b32 s33, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_i16_imm:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT: s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7b
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_i16@abs32@hi
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_i16@abs32@lo
+; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
+; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT: s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_i16_imm:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT: s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_i16@abs32@hi
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_i16@abs32@lo
+; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
+; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_imm:
; GFX10-SCRATCH: ; %bb.0:
@@ -2161,33 +2217,61 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: test_call_external_void_func_f16_imm:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s33
-; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_f16@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_f16@abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-NEXT: s_mov_b32 s32, s33
-; GFX11-NEXT: v_readlane_b32 s0, v40, 2
-; GFX11-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_mov_b32 s33, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_f16_imm:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT: s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x4400
+; GFX11-TRUE16-NEXT: s_mov_b32 s1, external_void_func_f16@abs32@hi
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, external_void_func_f16@abs32@lo
+; GFX11-TRUE16-NEXT: s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2
+; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT: s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_f16_imm:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT: s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x4400
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, external_void_func_f16@abs32@hi
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, external_void_func_f16@abs32@lo
+; GFX11-FAKE16-NEXT: s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2
+; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_f16_imm:
; GFX10-SCRATCH: ; %bb.0:
@@ -4896,23 +4980,22 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v0, off
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
@@ -5156,30 +5239,29 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v6
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v0.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 4
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
-; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_store_b8 v[0:1], v4, off
; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v2, off
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
@@ -5441,36 +5523,34 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v8
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v5.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v7.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.h, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v4
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v4
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v4
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v5
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
-; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
-; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-TRUE16-NEXT: global_store_b64 v[40:41], v[1:2], off
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
+; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
@@ -5910,85 +5990,77 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v32 :: v_dual_mov_b32 v18, v33
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v19, v34
; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v0.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v3.h, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v3.h, v2.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v13
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v3.h, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v2.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v12.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v31.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v6.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v2, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v20.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v6, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.l, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v7, v13
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v6.l, v0.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v6, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v10
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v13
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v12
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v19.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v12
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_store_b128 v[42:43], v[6:9], off
-; GFX11-TRUE16-NEXT: global_store_b128 v[40:41], v[2:5], off
+; GFX11-TRUE16-NEXT: global_store_b128 v[42:43], v[0:3], off
+; GFX11-TRUE16-NEXT: global_store_b128 v[40:41], v[5:8], off
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:4
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index 5c183f5..b750d28 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -106,11 +106,17 @@ define amdgpu_gfx i16 @return_i16() #0 {
; GFX9-NEXT: v_mov_b32_e32 v0, 10
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10PLUS-LABEL: return_i16:
-; GFX10PLUS: ; %bb.0: ; %entry
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 10
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: return_i16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 10
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: return_i16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b16_e32 v0.l, 10
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
ret i16 10
}
diff --git a/llvm/test/CodeGen/AMDGPU/gfx10plus-wavefront-sgpr-count.ll b/llvm/test/CodeGen/AMDGPU/gfx10plus-wavefront-sgpr-count.ll
new file mode 100644
index 0000000..1826a51
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/gfx10plus-wavefront-sgpr-count.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -filetype=obj < %s 2>&1 | llvm-objdump -d --section=.rodata - | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -filetype=obj < %s 2>&1 | llvm-objdump -d --section=.rodata - | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -filetype=obj < %s 2>&1 | llvm-objdump -d --section=.rodata - | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=obj < %s 2>&1 | llvm-objdump -d --section=.rodata - | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -filetype=obj < %s 2>&1 | llvm-objdump -d --section=.rodata - | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -filetype=obj < %s 2>&1 | llvm-objdump -d --section=.rodata - | FileCheck %s
+
+; CHECK-NOT: error
+define amdgpu_kernel void @test(i128 inreg) {
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
index d13d76f..fcdba69 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll
@@ -86,15 +86,3 @@ entry:
store atomic i32 %val, ptr addrspace(3) %dst syncscope("wavefront") unordered, align 4
ret void
}
-
-; GCN: scratch_atomic_store:
-; CU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
-; NOCU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
-; GCN: .amdhsa_kernel scratch_atomic_store
-; CU: .amdhsa_uses_cu_stores 1
-; NOCU: .amdhsa_uses_cu_stores 0
-define amdgpu_kernel void @scratch_atomic_store(ptr addrspace(5) %dst, i32 %val) {
-entry:
- store atomic i32 %val, ptr addrspace(5) %dst syncscope("wavefront") unordered, align 4
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll
index d1e82a0..99025f0 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll
@@ -39,20 +39,19 @@ define void @test_flat_store_no_scratch_alloc(ptr %ptr, i32 %val) #0 {
ret void
}
-; TODO: handle
define void @test_flat_store_noalias_addrspace(ptr %ptr, i32 %val) {
; GCN-LABEL: test_flat_store_noalias_addrspace:
; GCN: ; %bb.0:
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GCN-NEXT: flat_store_b32 v[0:1], v2
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: s_set_pc_i64 s[30:31]
store i32 %val, ptr %ptr, !noalias.addrspace !{i32 5, i32 6}
ret void
}
-; TODO: would be nice to handle too
+; TODO: would be nice to handle
define void @test_flat_store_select(ptr addrspace(1) %a, ptr addrspace(3) %b, i1 %cond, i32 %val) {
; GCN-SDAG-LABEL: test_flat_store_select:
; GCN-SDAG: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
index c2ddce4..85549b8 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
@@ -146,7 +146,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX942-NEXT: successors: %bb.3(0x80000000)
; GFX90A_GFX942-NEXT: {{ $}}
; GFX90A_GFX942-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY %1
+ ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:av_32 = COPY %1
; GFX90A_GFX942-NEXT: GLOBAL_ATOMIC_ADD_F32_SADDR killed [[V_MOV_B32_e32_1]], [[COPY8]], [[COPY3]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr, addrspace 1)
; GFX90A_GFX942-NEXT: {{ $}}
; GFX90A_GFX942-NEXT: bb.3.Flow:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
index 682c1cd..0288455 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll
@@ -14,7 +14,7 @@ define amdgpu_ps void @global_atomic_fadd_f64_no_rtn_atomicrmw(ptr addrspace(1)
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
- ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE]]
; GFX90A_GFX942-NEXT: GLOBAL_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
; GFX90A_GFX942-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
@@ -105,7 +105,7 @@ define amdgpu_ps void @global_atomic_fadd_f64_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; GFX90A_GFX942-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE1]]
; GFX90A_GFX942-NEXT: GLOBAL_ATOMIC_ADD_F64_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY4]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, addrspace 1)
; GFX90A_GFX942-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 1f74fbd..9c1f9d2 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -8275,13 +8275,12 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -8394,13 +8393,12 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
@@ -8700,13 +8698,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -8823,13 +8820,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
@@ -9138,13 +9134,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -9262,13 +9257,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
@@ -9576,11 +9570,11 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -9690,11 +9684,11 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
@@ -9985,11 +9979,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -10103,11 +10097,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
@@ -10406,11 +10400,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -10525,11 +10519,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
@@ -10819,10 +10813,9 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -10908,10 +10901,9 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
@@ -11144,8 +11136,8 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -11228,8 +11220,8 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
@@ -11464,13 +11456,12 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
@@ -11589,13 +11580,12 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
@@ -11906,11 +11896,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
@@ -12026,11 +12016,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index faa74fe..f7cc070 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -4467,14 +4467,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -4592,14 +4592,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
@@ -4912,14 +4912,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -5044,14 +5044,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
@@ -5373,14 +5373,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -5506,14 +5506,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
@@ -5832,13 +5832,12 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -5954,13 +5953,12 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
@@ -6265,13 +6263,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6394,13 +6391,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
@@ -6713,13 +6709,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6843,13 +6838,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
@@ -7151,11 +7145,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -7246,11 +7240,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
@@ -7494,10 +7488,9 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -7586,10 +7579,9 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
@@ -7838,14 +7830,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
@@ -7972,14 +7964,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
@@ -8303,13 +8295,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
@@ -8434,13 +8425,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index a46b012..b81af1f 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -4467,14 +4467,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -4592,14 +4592,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
@@ -4912,14 +4912,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -5044,14 +5044,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
@@ -5373,14 +5373,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -5506,14 +5506,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
@@ -5832,13 +5832,12 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -5954,13 +5953,12 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l
; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
@@ -6265,13 +6263,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6394,13 +6391,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
@@ -6713,13 +6709,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6843,13 +6838,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
@@ -7151,11 +7145,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -7246,11 +7240,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
@@ -7494,10 +7488,9 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -7586,10 +7579,9 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
@@ -7838,14 +7830,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
@@ -7972,14 +7964,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
@@ -8303,13 +8295,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
@@ -8434,13 +8425,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l
; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
index 053efdc..b8762d13 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
@@ -5221,13 +5221,12 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val)
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -5340,13 +5339,12 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val)
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
@@ -5646,13 +5644,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -5769,13 +5766,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
@@ -6084,13 +6080,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6208,13 +6203,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
@@ -6522,11 +6516,11 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6636,11 +6630,11 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
@@ -6931,11 +6925,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -7049,11 +7043,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
@@ -7352,11 +7346,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1)
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -7471,11 +7465,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1)
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
@@ -7765,10 +7759,9 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -7854,10 +7847,9 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
@@ -8090,8 +8082,8 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -8174,8 +8166,8 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
@@ -8410,13 +8402,12 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
@@ -8535,13 +8526,12 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
@@ -8852,11 +8842,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
@@ -8972,11 +8962,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
diff --git a/llvm/test/CodeGen/AMDGPU/global-extload-gfx11plus.ll b/llvm/test/CodeGen/AMDGPU/global-extload-gfx11plus.ll
new file mode 100644
index 0000000..f92ba7a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/global-extload-gfx11plus.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+
+define amdgpu_kernel void @zextload_global_i8_to_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GFX11-REAL16-LABEL: zextload_global_i8_to_i16:
+; GFX11-REAL16: ; %bb.0:
+; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: global_load_d16_u8 v0, v1, s[2:3]
+; GFX11-REAL16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-REAL16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-REAL16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: zextload_global_i8_to_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %in
+ %ext = zext i8 %a to i16
+ store i16 %ext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @sextload_global_i8_to_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GFX11-REAL16-LABEL: sextload_global_i8_to_i16:
+; GFX11-REAL16: ; %bb.0:
+; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: global_load_d16_i8 v0, v1, s[2:3]
+; GFX11-REAL16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-REAL16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-REAL16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: sextload_global_i8_to_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_i8 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %in
+ %ext = sext i8 %a to i16
+ store i16 %ext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @zextload_global_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GFX11-REAL16-LABEL: zextload_global_i8_to_i64:
+; GFX11-REAL16: ; %bb.0:
+; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: global_load_d16_u8 v0, v1, s[2:3]
+; GFX11-REAL16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-REAL16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-REAL16-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX11-REAL16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: zextload_global_i8_to_i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %in
+ %ext = zext i8 %a to i64
+ store i64 %ext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @sextload_global_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GFX11-REAL16-LABEL: sextload_global_i8_to_i64:
+; GFX11-REAL16: ; %bb.0:
+; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: global_load_d16_i8 v0, v2, s[2:3]
+; GFX11-REAL16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-REAL16-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-REAL16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX11-REAL16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-REAL16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: sextload_global_i8_to_i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_i8 v0, v2, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
+ %a = load i8, ptr addrspace(1) %in
+ %ext = sext i8 %a to i64
+ store i64 %ext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @zextload_global_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GFX11-LABEL: zextload_global_i16_to_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %in
+ %ext = zext i16 %a to i32
+ store i32 %ext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @sextload_global_i16_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GFX11-LABEL: sextload_global_i16_to_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_i16 v1, v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %in
+ %ext = sext i16 %a to i32
+ store i32 %ext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @zextload_global_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GFX11-REAL16-LABEL: zextload_global_i16_to_i64:
+; GFX11-REAL16: ; %bb.0:
+; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-REAL16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-REAL16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-REAL16-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX11-REAL16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: zextload_global_i16_to_i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %in
+ %ext = zext i16 %a to i64
+ store i64 %ext, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @sextload_global_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
+; GFX11-REAL16-LABEL: sextload_global_i16_to_i64:
+; GFX11-REAL16: ; %bb.0:
+; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX11-REAL16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-REAL16-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-REAL16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX11-REAL16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-REAL16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: sextload_global_i16_to_i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
+ %a = load i16, ptr addrspace(1) %in
+ %ext = sext i16 %a to i64
+ store i64 %ext, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
index f2da966..57bfd249 100644
--- a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
@@ -145,12 +145,12 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add
; GFX90A-LABEL: half4:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX1030-LABEL: half4:
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
index 3a898a9..e532dea 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s
-; Test S_WAIT_XCNT insertion for global_load/store instructions.
+; Test S_WAIT_XCNT insertion for global_load/store clauses.
; Additional operations are introduced between the clauses to create a register dependency
; between the operands of the VMEM operations and the defs of the VALU instructions that follow.
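; A minimal, hypothetical IR sketch (not part of this test file) of the shape being exercised:
; a pair of global loads that can form a clause, with ALU uses of the loaded values before the
; store, so any inserted waits must cover the VMEM/VALU register dependency described above.
; Function and value names here are illustrative only.
;
;   define i32 @xcnt_dep_sketch(ptr addrspace(1) %p) {
;     %q = getelementptr inbounds i32, ptr addrspace(1) %p, i64 4
;     %a = load i32, ptr addrspace(1) %p        ; first VMEM load
;     %b = load i32, ptr addrspace(1) %q        ; second VMEM load, same base pointer
;     %sum = add i32 %a, %b                     ; ALU op consuming both loaded values
;     store i32 %sum, ptr addrspace(1) %p       ; VMEM store of the ALU result
;     ret i32 %sum
;   }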
@@ -123,29 +123,10 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-SDAG: ; %bb.0:
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
-; GCN-SDAG-NEXT: s_clause 0xd
-; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:52 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:48 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:44 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:40 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:36 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v45, s32 offset:32 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE
-; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:224
-; GCN-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
-; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:56 scope:SCOPE_SE ; 16-byte Folded Spill
+; GCN-SDAG-NEXT: v_dual_mov_b32 v39, v4 :: v_dual_mov_b32 v38, v3
+; GCN-SDAG-NEXT: s_clause 0xf
+; GCN-SDAG-NEXT: global_load_b128 v[2:5], v[0:1], off offset:224
; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:240
-; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:72 scope:SCOPE_SE ; 16-byte Folded Spill
-; GCN-SDAG-NEXT: s_clause 0xd
; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:192
; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:208
; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:160
@@ -155,137 +136,103 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:96
; GCN-SDAG-NEXT: global_load_b128 v[48:51], v[0:1], off offset:112
; GCN-SDAG-NEXT: global_load_b128 v[52:55], v[0:1], off offset:64
-; GCN-SDAG-NEXT: global_load_b128 v[38:41], v[0:1], off offset:80
-; GCN-SDAG-NEXT: global_load_b128 v[42:45], v[0:1], off offset:32
-; GCN-SDAG-NEXT: global_load_b128 v[56:59], v[0:1], off offset:48
-; GCN-SDAG-NEXT: global_load_b128 v[60:63], v[0:1], off
-; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16
-; GCN-SDAG-NEXT: scratch_load_b128 v[6:9], off, s32 offset:56 th:TH_LOAD_LU ; 16-byte Folded Reload
-; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:224
-; GCN-SDAG-NEXT: scratch_load_b128 v[6:9], off, s32 offset:72 th:TH_LOAD_LU ; 16-byte Folded Reload
-; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: s_clause 0xe
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:240
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:192
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[14:17], off offset:208
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:160
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[22:25], off offset:176
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[26:29], off offset:128
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[30:33], off offset:144
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[34:37], off offset:96
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[48:51], off offset:112
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[52:55], off offset:64
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[38:41], off offset:80
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[42:45], off offset:32
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[56:59], off offset:48
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[60:63], off
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[0:3], off offset:16
-; GCN-SDAG-NEXT: s_clause 0xd
-; GCN-SDAG-NEXT: scratch_load_b32 v63, off, s32
-; GCN-SDAG-NEXT: scratch_load_b32 v62, off, s32 offset:4
-; GCN-SDAG-NEXT: scratch_load_b32 v61, off, s32 offset:8
-; GCN-SDAG-NEXT: scratch_load_b32 v60, off, s32 offset:12
-; GCN-SDAG-NEXT: scratch_load_b32 v59, off, s32 offset:16
-; GCN-SDAG-NEXT: scratch_load_b32 v58, off, s32 offset:20
-; GCN-SDAG-NEXT: scratch_load_b32 v57, off, s32 offset:24
-; GCN-SDAG-NEXT: scratch_load_b32 v56, off, s32 offset:28
-; GCN-SDAG-NEXT: scratch_load_b32 v45, off, s32 offset:32
-; GCN-SDAG-NEXT: scratch_load_b32 v44, off, s32 offset:36
-; GCN-SDAG-NEXT: scratch_load_b32 v43, off, s32 offset:40
-; GCN-SDAG-NEXT: scratch_load_b32 v42, off, s32 offset:44
-; GCN-SDAG-NEXT: scratch_load_b32 v41, off, s32 offset:48
-; GCN-SDAG-NEXT: scratch_load_b32 v40, off, s32 offset:52
-; GCN-SDAG-NEXT: s_wait_xcnt 0xe
-; GCN-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GCN-SDAG-NEXT: global_load_b128 v[64:67], v[0:1], off offset:80
+; GCN-SDAG-NEXT: global_load_b128 v[68:71], v[0:1], off offset:32
+; GCN-SDAG-NEXT: global_load_b128 v[80:83], v[0:1], off offset:48
+; GCN-SDAG-NEXT: global_load_b128 v[84:87], v[0:1], off
+; GCN-SDAG-NEXT: global_load_b128 v[96:99], v[0:1], off offset:16
+; GCN-SDAG-NEXT: s_wait_loadcnt 0xf
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[2:5], off offset:224
+; GCN-SDAG-NEXT: s_wait_loadcnt 0xe
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[6:9], off offset:240
+; GCN-SDAG-NEXT: s_wait_loadcnt 0xd
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[10:13], off offset:192
+; GCN-SDAG-NEXT: s_wait_loadcnt 0xc
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[14:17], off offset:208
+; GCN-SDAG-NEXT: s_wait_loadcnt 0xb
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[18:21], off offset:160
+; GCN-SDAG-NEXT: s_wait_loadcnt 0xa
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[22:25], off offset:176
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x9
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[26:29], off offset:128
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x8
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[30:33], off offset:144
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x7
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[34:37], off offset:96
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x6
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[48:51], off offset:112
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x5
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[52:55], off offset:64
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x4
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[64:67], off offset:80
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x3
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[68:71], off offset:32
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x2
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[80:83], off offset:48
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x1
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[84:87], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[96:99], off offset:16
+; GCN-SDAG-NEXT: s_wait_xcnt 0x10
+; GCN-SDAG-NEXT: v_mov_b32_e32 v0, v98
; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
; GCN-GISEL-LABEL: test_v64i32_load_store:
; GCN-GISEL: ; %bb.0:
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
+; GCN-GISEL-NEXT: v_dual_mov_b32 v38, v3 :: v_dual_mov_b32 v39, v4
; GCN-GISEL-NEXT: s_clause 0xf
-; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:60 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:56 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:52 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:48 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:44 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 offset:40 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v46, s32 offset:36 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v47, s32 offset:32 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE
-; GCN-GISEL-NEXT: s_wait_xcnt 0x8
-; GCN-GISEL-NEXT: v_dual_mov_b32 v46, v3 :: v_dual_mov_b32 v47, v4
; GCN-GISEL-NEXT: global_load_b128 v[2:5], v[0:1], off offset:32
-; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: scratch_store_b128 off, v[2:5], s32 offset:80 scope:SCOPE_SE ; 16-byte Folded Spill
-; GCN-GISEL-NEXT: s_clause 0xe
; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:48
-; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off offset:64
-; GCN-GISEL-NEXT: global_load_b128 v[14:17], v[0:1], off offset:80
-; GCN-GISEL-NEXT: global_load_b128 v[18:21], v[0:1], off offset:96
-; GCN-GISEL-NEXT: global_load_b128 v[22:25], v[0:1], off offset:112
-; GCN-GISEL-NEXT: global_load_b128 v[26:29], v[0:1], off offset:128
-; GCN-GISEL-NEXT: global_load_b128 v[30:33], v[0:1], off offset:144
-; GCN-GISEL-NEXT: global_load_b128 v[34:37], v[0:1], off offset:160
-; GCN-GISEL-NEXT: global_load_b128 v[48:51], v[0:1], off offset:176
-; GCN-GISEL-NEXT: global_load_b128 v[52:55], v[0:1], off offset:192
-; GCN-GISEL-NEXT: global_load_b128 v[38:41], v[0:1], off offset:208
-; GCN-GISEL-NEXT: global_load_b128 v[42:45], v[0:1], off offset:224
-; GCN-GISEL-NEXT: global_load_b128 v[56:59], v[0:1], off
-; GCN-GISEL-NEXT: global_load_b128 v[60:63], v[0:1], off offset:16
-; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[0:1], off offset:240
-; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:64 scope:SCOPE_SE ; 16-byte Folded Spill
-; GCN-GISEL-NEXT: scratch_load_b128 v[0:3], off, s32 offset:80 th:TH_LOAD_LU ; 16-byte Folded Reload
-; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: s_clause 0xe
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[0:3], off offset:32
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[6:9], off offset:48
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[10:13], off offset:64
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[14:17], off offset:80
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[18:21], off offset:96
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[22:25], off offset:112
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[26:29], off offset:128
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[30:33], off offset:144
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[34:37], off offset:160
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[48:51], off offset:176
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[52:55], off offset:192
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[38:41], off offset:208
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[42:45], off offset:224
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[56:59], off
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[60:63], off offset:16
-; GCN-GISEL-NEXT: scratch_load_b128 v[0:3], off, s32 offset:64 th:TH_LOAD_LU ; 16-byte Folded Reload
-; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
-; GCN-GISEL-NEXT: global_store_b128 v[46:47], v[0:3], off offset:240
-; GCN-GISEL-NEXT: s_wait_xcnt 0x0
-; GCN-GISEL-NEXT: v_mov_b32_e32 v0, v62
-; GCN-GISEL-NEXT: s_clause 0xf
-; GCN-GISEL-NEXT: scratch_load_b32 v63, off, s32
-; GCN-GISEL-NEXT: scratch_load_b32 v62, off, s32 offset:4
-; GCN-GISEL-NEXT: scratch_load_b32 v61, off, s32 offset:8
-; GCN-GISEL-NEXT: scratch_load_b32 v60, off, s32 offset:12
-; GCN-GISEL-NEXT: scratch_load_b32 v59, off, s32 offset:16
-; GCN-GISEL-NEXT: scratch_load_b32 v58, off, s32 offset:20
-; GCN-GISEL-NEXT: scratch_load_b32 v57, off, s32 offset:24
-; GCN-GISEL-NEXT: scratch_load_b32 v56, off, s32 offset:28
-; GCN-GISEL-NEXT: scratch_load_b32 v47, off, s32 offset:32
-; GCN-GISEL-NEXT: scratch_load_b32 v46, off, s32 offset:36
-; GCN-GISEL-NEXT: scratch_load_b32 v45, off, s32 offset:40
-; GCN-GISEL-NEXT: scratch_load_b32 v44, off, s32 offset:44
-; GCN-GISEL-NEXT: scratch_load_b32 v43, off, s32 offset:48
-; GCN-GISEL-NEXT: scratch_load_b32 v42, off, s32 offset:52
-; GCN-GISEL-NEXT: scratch_load_b32 v41, off, s32 offset:56
-; GCN-GISEL-NEXT: scratch_load_b32 v40, off, s32 offset:60
+; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off
+; GCN-GISEL-NEXT: global_load_b128 v[14:17], v[0:1], off offset:16
+; GCN-GISEL-NEXT: global_load_b128 v[18:21], v[0:1], off offset:64
+; GCN-GISEL-NEXT: global_load_b128 v[22:25], v[0:1], off offset:80
+; GCN-GISEL-NEXT: global_load_b128 v[26:29], v[0:1], off offset:96
+; GCN-GISEL-NEXT: global_load_b128 v[30:33], v[0:1], off offset:112
+; GCN-GISEL-NEXT: global_load_b128 v[34:37], v[0:1], off offset:128
+; GCN-GISEL-NEXT: global_load_b128 v[48:51], v[0:1], off offset:144
+; GCN-GISEL-NEXT: global_load_b128 v[52:55], v[0:1], off offset:160
+; GCN-GISEL-NEXT: global_load_b128 v[64:67], v[0:1], off offset:176
+; GCN-GISEL-NEXT: global_load_b128 v[68:71], v[0:1], off offset:192
+; GCN-GISEL-NEXT: global_load_b128 v[80:83], v[0:1], off offset:208
+; GCN-GISEL-NEXT: global_load_b128 v[84:87], v[0:1], off offset:224
+; GCN-GISEL-NEXT: global_load_b128 v[96:99], v[0:1], off offset:240
+; GCN-GISEL-NEXT: s_wait_loadcnt 0xf
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[2:5], off offset:32
+; GCN-GISEL-NEXT: s_wait_loadcnt 0xe
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[6:9], off offset:48
+; GCN-GISEL-NEXT: s_wait_loadcnt 0xd
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[10:13], off
+; GCN-GISEL-NEXT: s_wait_loadcnt 0xc
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[14:17], off offset:16
+; GCN-GISEL-NEXT: s_wait_loadcnt 0xb
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[18:21], off offset:64
+; GCN-GISEL-NEXT: s_wait_loadcnt 0xa
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[22:25], off offset:80
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x9
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[26:29], off offset:96
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x8
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[30:33], off offset:112
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x7
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[34:37], off offset:128
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x6
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[48:51], off offset:144
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x5
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[52:55], off offset:160
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x4
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[64:67], off offset:176
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x3
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[68:71], off offset:192
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x2
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[80:83], off offset:208
+; GCN-GISEL-NEXT: s_wait_loadcnt 0x1
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[84:87], off offset:224
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
+; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[96:99], off offset:240
+; GCN-GISEL-NEXT: s_wait_xcnt 0x10
+; GCN-GISEL-NEXT: v_mov_b32_e32 v0, v16
; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31]
%vec = load <64 x i32>, ptr addrspace(1) %ptr
store <64 x i32> %vec, ptr addrspace(1) %out, align 4
@@ -298,99 +245,78 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-SDAG: ; %bb.0:
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
-; GCN-SDAG-NEXT: s_clause 0x3
-; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:12 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:8 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4 scope:SCOPE_SE
-; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 scope:SCOPE_SE
; GCN-SDAG-NEXT: s_clause 0x7
-; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:112
-; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:96
-; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:80
-; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:48
-; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:32
+; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:112
+; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:96
+; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:80
+; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:48
+; GCN-SDAG-NEXT: global_load_b128 v[22:25], v[0:1], off offset:32
; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:16
; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off
; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[0:1], off offset:64
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[24:25], 0x70
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 0x60
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 48
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 32
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[40:41], 16
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x50
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[42:43], 0
-; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 64
-; GCN-SDAG-NEXT: v_dual_mov_b32 v22, 0xc8 :: v_dual_mov_b32 v23, 0
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[36:37], 0x70
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 48
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 32
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[64:65], 16
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[66:67], 0
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 0x50
+; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 64
+; GCN-SDAG-NEXT: v_dual_mov_b32 v34, 0xc8 :: v_dual_mov_b32 v35, 0
; GCN-SDAG-NEXT: s_wait_loadcnt 0x7
-; GCN-SDAG-NEXT: global_store_b128 v[24:25], v[10:13], off
+; GCN-SDAG-NEXT: global_store_b128 v[36:37], v[6:9], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x6
-; GCN-SDAG-NEXT: global_store_b128 v[50:51], v[18:21], off
+; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[10:13], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x5
; GCN-SDAG-NEXT: s_wait_xcnt 0x1
-; GCN-SDAG-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v25, v9
+; GCN-SDAG-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v37, v17
; GCN-SDAG-NEXT: s_wait_xcnt 0x0
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[20:21], v[20:21], v[20:21]
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
; GCN-SDAG-NEXT: s_wait_loadcnt 0x4
-; GCN-SDAG-NEXT: global_store_b128 v[52:53], v[34:37], off
+; GCN-SDAG-NEXT: global_store_b128 v[48:49], v[18:21], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x3
-; GCN-SDAG-NEXT: global_store_b128 v[54:55], v[14:17], off
+; GCN-SDAG-NEXT: global_store_b128 v[50:51], v[22:25], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x2
-; GCN-SDAG-NEXT: global_store_b128 v[40:41], v[26:29], off
+; GCN-SDAG-NEXT: global_store_b128 v[64:65], v[26:29], off
; GCN-SDAG-NEXT: s_wait_loadcnt 0x1
-; GCN-SDAG-NEXT: global_store_b128 v[42:43], v[30:33], off
-; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
-; GCN-SDAG-NEXT: s_wait_xcnt 0x3
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[52:53], v[2:3], v[2:3]
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[50:51], v[0:1], v[0:1]
+; GCN-SDAG-NEXT: global_store_b128 v[66:67], v[30:33], off
+; GCN-SDAG-NEXT: s_wait_xcnt 0x0
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9]
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[6:7], 0xc8, v[6:7]
-; GCN-SDAG-NEXT: s_wait_xcnt 0x2
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[16:17], 0x64, v[16:17]
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[14:15], v[14:15], v[14:15]
-; GCN-SDAG-NEXT: s_wait_xcnt 0x1
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[6:7], v[6:7], v[6:7]
+; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[50:51], v[2:3], v[2:3]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[48:49], v[0:1], v[0:1]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[16:17], v[16:17], v[16:17]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[14:15], 0xc8, v[14:15]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[24:25], 0x64, v[24:25]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[22:23], v[22:23], v[22:23]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[28:29], v[28:29], v[28:29]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[26:27], v[26:27], v[26:27]
-; GCN-SDAG-NEXT: s_wait_xcnt 0x0
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[30:31], v[30:31], v[30:31]
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[36:37], v[36:37], v[36:37]
-; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[34:35], v[34:35], v[34:35]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[20:21], v[20:21], v[20:21]
+; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19]
; GCN-SDAG-NEXT: s_clause 0x1
-; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[22:25], off
-; GCN-SDAG-NEXT: global_store_b128 v[48:49], v[0:3], off
+; GCN-SDAG-NEXT: global_store_b128 v[52:53], v[34:37], off
+; GCN-SDAG-NEXT: global_store_b128 v[54:55], v[0:3], off
; GCN-SDAG-NEXT: s_clause 0x7
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:96
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:112
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[50:53], off offset:64
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:80
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[14:17], off offset:32
-; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[34:37], off offset:48
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:96
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:112
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[48:51], off offset:64
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[14:17], off offset:80
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[22:25], off offset:32
+; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:48
; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[30:33], off
; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[26:29], off offset:16
-; GCN-SDAG-NEXT: s_clause 0x3
-; GCN-SDAG-NEXT: scratch_load_b32 v43, off, s32
-; GCN-SDAG-NEXT: scratch_load_b32 v42, off, s32 offset:4
-; GCN-SDAG-NEXT: scratch_load_b32 v41, off, s32 offset:8
-; GCN-SDAG-NEXT: scratch_load_b32 v40, off, s32 offset:12
-; GCN-SDAG-NEXT: s_wait_xcnt 0xc
+; GCN-SDAG-NEXT: s_wait_xcnt 0x8
; GCN-SDAG-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v33
-; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
; GCN-GISEL-LABEL: test_v16i64_load_store:
; GCN-GISEL: ; %bb.0:
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
-; GCN-GISEL-NEXT: s_clause 0x5
-; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:20 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:16 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:12 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:8 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:4 scope:SCOPE_SE
-; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 scope:SCOPE_SE
; GCN-GISEL-NEXT: s_clause 0x7
; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:80
; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off
@@ -404,11 +330,11 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: v_mov_b64_e32 v[48:49], 16
; GCN-GISEL-NEXT: v_mov_b64_e32 v[50:51], 32
; GCN-GISEL-NEXT: v_mov_b64_e32 v[52:53], 48
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x60
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[66:67], 0x60
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[68:69], 0x70
; GCN-GISEL-NEXT: v_mov_b64_e32 v[54:55], 64
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[44:45], 0x70
; GCN-GISEL-NEXT: v_mov_b64_e32 v[34:35], 0xc8
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[40:41], 0x50
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[64:65], 0x50
; GCN-GISEL-NEXT: s_wait_loadcnt 0x6
; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[10:13], off
; GCN-GISEL-NEXT: s_wait_loadcnt 0x5
@@ -418,13 +344,13 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: s_wait_loadcnt 0x3
; GCN-GISEL-NEXT: global_store_b128 v[52:53], v[22:25], off
; GCN-GISEL-NEXT: s_wait_loadcnt 0x2
-; GCN-GISEL-NEXT: global_store_b128 v[42:43], v[26:29], off
+; GCN-GISEL-NEXT: global_store_b128 v[66:67], v[26:29], off
; GCN-GISEL-NEXT: s_wait_loadcnt 0x1
-; GCN-GISEL-NEXT: global_store_b128 v[44:45], v[30:33], off
-; GCN-GISEL-NEXT: v_mov_b64_e32 v[36:37], v[8:9]
+; GCN-GISEL-NEXT: global_store_b128 v[68:69], v[30:33], off
; GCN-GISEL-NEXT: s_wait_xcnt 0x5
-; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13]
+; GCN-GISEL-NEXT: v_mov_b64_e32 v[36:37], v[8:9]
+; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11]
; GCN-GISEL-NEXT: s_wait_xcnt 0x4
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[14:15], v[14:15], v[14:15]
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[16:17], v[16:17], v[16:17]
@@ -447,7 +373,7 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33]
; GCN-GISEL-NEXT: s_clause 0x1
; GCN-GISEL-NEXT: global_store_b128 v[54:55], v[0:3], off
-; GCN-GISEL-NEXT: global_store_b128 v[40:41], v[34:37], off
+; GCN-GISEL-NEXT: global_store_b128 v[64:65], v[34:37], off
; GCN-GISEL-NEXT: s_clause 0x7
; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off
; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[14:17], off offset:16
@@ -457,15 +383,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off offset:80
; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[26:29], off offset:96
; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[30:33], off offset:112
-; GCN-GISEL-NEXT: s_clause 0x5
-; GCN-GISEL-NEXT: scratch_load_b32 v45, off, s32
-; GCN-GISEL-NEXT: scratch_load_b32 v44, off, s32 offset:4
-; GCN-GISEL-NEXT: scratch_load_b32 v43, off, s32 offset:8
-; GCN-GISEL-NEXT: scratch_load_b32 v42, off, s32 offset:12
-; GCN-GISEL-NEXT: scratch_load_b32 v41, off, s32 offset:16
-; GCN-GISEL-NEXT: scratch_load_b32 v40, off, s32 offset:20
+; GCN-GISEL-NEXT: s_wait_xcnt 0x9
; GCN-GISEL-NEXT: v_dual_mov_b32 v0, v12 :: v_dual_mov_b32 v1, v13
-; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31]
%a = load <16 x i64>, ptr addrspace(1) %ptr_a, align 4
%in_a = insertelement <16 x i64> %a, i64 100, i32 5
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
index 1def479..d297955 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
@@ -985,7 +985,7 @@ define amdgpu_ps float @global_sub_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = atomicrmw sub ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
+ %rtn = atomicrmw sub ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -1024,7 +1024,7 @@ define amdgpu_ps float @global_sub_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = atomicrmw sub ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
+ %rtn = atomicrmw sub ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -1061,7 +1061,7 @@ define amdgpu_ps void @global_sub_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = atomicrmw sub ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
+ %unused = atomicrmw sub ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1098,7 +1098,7 @@ define amdgpu_ps void @global_sub_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = atomicrmw sub ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
+ %unused = atomicrmw sub ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1135,7 +1135,7 @@ define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = atomicrmw sub ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
+ %rtn = atomicrmw sub ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -1174,7 +1174,7 @@ define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn_neg128(ptr addrspace(1) i
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = atomicrmw sub ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
+ %rtn = atomicrmw sub ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -1211,7 +1211,7 @@ define amdgpu_ps void @global_sub_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = atomicrmw sub ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
+ %unused = atomicrmw sub ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1248,7 +1248,7 @@ define amdgpu_ps void @global_sub_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = atomicrmw sub ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
+ %unused = atomicrmw sub ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1289,7 +1289,7 @@ define amdgpu_ps float @global_and_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = atomicrmw and ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
+ %rtn = atomicrmw and ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -1328,7 +1328,7 @@ define amdgpu_ps float @global_and_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = atomicrmw and ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
+ %rtn = atomicrmw and ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -1365,7 +1365,7 @@ define amdgpu_ps void @global_and_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = atomicrmw and ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
+ %unused = atomicrmw and ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1402,7 +1402,7 @@ define amdgpu_ps void @global_and_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = atomicrmw and ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
+ %unused = atomicrmw and ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1439,7 +1439,7 @@ define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = atomicrmw and ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
+ %rtn = atomicrmw and ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -1478,7 +1478,7 @@ define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn_neg128(ptr addrspace(1) i
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = atomicrmw and ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
+ %rtn = atomicrmw and ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -1515,7 +1515,7 @@ define amdgpu_ps void @global_and_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = atomicrmw and ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
+ %unused = atomicrmw and ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1552,7 +1552,7 @@ define amdgpu_ps void @global_and_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = atomicrmw and ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
+ %unused = atomicrmw and ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1593,7 +1593,7 @@ define amdgpu_ps float @global_or_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = atomicrmw or ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
+ %rtn = atomicrmw or ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -1632,7 +1632,7 @@ define amdgpu_ps float @global_or_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %s
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = atomicrmw or ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
+ %rtn = atomicrmw or ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -1669,7 +1669,7 @@ define amdgpu_ps void @global_or_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = atomicrmw or ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
+ %unused = atomicrmw or ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1706,7 +1706,7 @@ define amdgpu_ps void @global_or_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = atomicrmw or ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
+ %unused = atomicrmw or ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1743,7 +1743,7 @@ define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn(ptr addrspace(1) inreg %sb
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = atomicrmw or ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
+ %rtn = atomicrmw or ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -1782,7 +1782,7 @@ define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn_neg128(ptr addrspace(1) in
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = atomicrmw or ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
+ %rtn = atomicrmw or ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -1819,7 +1819,7 @@ define amdgpu_ps void @global_or_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = atomicrmw or ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
+ %unused = atomicrmw or ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1856,7 +1856,7 @@ define amdgpu_ps void @global_or_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = atomicrmw or ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
+ %unused = atomicrmw or ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1897,7 +1897,7 @@ define amdgpu_ps float @global_xor_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = atomicrmw xor ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
+ %rtn = atomicrmw xor ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -1936,7 +1936,7 @@ define amdgpu_ps float @global_xor_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = atomicrmw xor ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
+ %rtn = atomicrmw xor ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -1973,7 +1973,7 @@ define amdgpu_ps void @global_xor_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = atomicrmw xor ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst
+ %unused = atomicrmw xor ptr addrspace(1) %gep0, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2010,7 +2010,7 @@ define amdgpu_ps void @global_xor_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = atomicrmw xor ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst
+ %unused = atomicrmw xor ptr addrspace(1) %gep1, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2047,7 +2047,7 @@ define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = atomicrmw xor ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
+ %rtn = atomicrmw xor ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -2086,7 +2086,7 @@ define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn_neg128(ptr addrspace(1) i
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = atomicrmw xor ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
+ %rtn = atomicrmw xor ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -2123,7 +2123,7 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = atomicrmw xor ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst
+ %unused = atomicrmw xor ptr addrspace(1) %gep0, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2160,7 +2160,7 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = atomicrmw xor ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst
+ %unused = atomicrmw xor ptr addrspace(1) %gep1, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2198,7 +2198,7 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = atomicrmw max ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
+ %rtn = atomicrmw max ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -2234,7 +2234,7 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = atomicrmw max ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
+ %rtn = atomicrmw max ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -2267,7 +2267,7 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = atomicrmw max ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
+ %unused = atomicrmw max ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2300,7 +2300,7 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = atomicrmw max ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
+ %unused = atomicrmw max ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2334,7 +2334,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = atomicrmw max ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
+ %rtn = atomicrmw max ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -2370,7 +2370,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = atomicrmw max ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
+ %rtn = atomicrmw max ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -2403,7 +2403,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = atomicrmw max ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
+ %unused = atomicrmw max ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2436,7 +2436,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = atomicrmw max ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
+ %unused = atomicrmw max ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2474,7 +2474,7 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = atomicrmw min ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
+ %rtn = atomicrmw min ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -2510,7 +2510,7 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = atomicrmw min ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
+ %rtn = atomicrmw min ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -2543,7 +2543,7 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = atomicrmw min ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
+ %unused = atomicrmw min ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2576,7 +2576,7 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = atomicrmw min ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
+ %unused = atomicrmw min ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2610,7 +2610,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = atomicrmw min ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
+ %rtn = atomicrmw min ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -2646,7 +2646,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = atomicrmw min ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
+ %rtn = atomicrmw min ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -2679,7 +2679,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = atomicrmw min ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
+ %unused = atomicrmw min ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2712,7 +2712,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = atomicrmw min ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
+ %unused = atomicrmw min ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2750,7 +2750,7 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = atomicrmw umax ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
+ %rtn = atomicrmw umax ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -2786,7 +2786,7 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = atomicrmw umax ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
+ %rtn = atomicrmw umax ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -2819,7 +2819,7 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = atomicrmw umax ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
+ %unused = atomicrmw umax ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2852,7 +2852,7 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = atomicrmw umax ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
+ %unused = atomicrmw umax ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2886,7 +2886,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = atomicrmw umax ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
+ %rtn = atomicrmw umax ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -2922,7 +2922,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1)
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = atomicrmw umax ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
+ %rtn = atomicrmw umax ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -2955,7 +2955,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = atomicrmw umax ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
+ %unused = atomicrmw umax ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -2988,7 +2988,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = atomicrmw umax ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
+ %unused = atomicrmw umax ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -3026,7 +3026,7 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = atomicrmw umin ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
+ %rtn = atomicrmw umin ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -3062,7 +3062,7 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = atomicrmw umin ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
+ %rtn = atomicrmw umin ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -3095,7 +3095,7 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = atomicrmw umin ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst
+ %unused = atomicrmw umin ptr addrspace(1) %gep0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -3128,7 +3128,7 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = atomicrmw umin ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst
+ %unused = atomicrmw umin ptr addrspace(1) %gep1, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -3162,7 +3162,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = atomicrmw umin ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
+ %rtn = atomicrmw umin ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -3198,7 +3198,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1)
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = atomicrmw umin ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
+ %rtn = atomicrmw umin ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -3231,7 +3231,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = atomicrmw umin ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst
+ %unused = atomicrmw umin ptr addrspace(1) %gep0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -3264,7 +3264,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = atomicrmw umin ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst
+ %unused = atomicrmw umin ptr addrspace(1) %gep1, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -3664,7 +3664,7 @@ define amdgpu_ps float @global_inc_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic
+ %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -3690,7 +3690,7 @@ define amdgpu_ps float @global_inc_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic
+ %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -3712,7 +3712,7 @@ define amdgpu_ps void @global_inc_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic
+ %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -3734,7 +3734,7 @@ define amdgpu_ps void @global_inc_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic
+ %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -3758,7 +3758,7 @@ define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic
+ %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -3784,7 +3784,7 @@ define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn_neg128(ptr addrspace(1) i
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic
+ %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -3806,7 +3806,7 @@ define amdgpu_ps void @global_inc_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic
+ %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -3828,7 +3828,7 @@ define amdgpu_ps void @global_inc_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic
+ %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -3857,7 +3857,7 @@ define amdgpu_ps float @global_dec_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic
+ %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -3883,7 +3883,7 @@ define amdgpu_ps float @global_dec_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic
+ %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -3905,7 +3905,7 @@ define amdgpu_ps void @global_dec_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic
+ %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -3927,7 +3927,7 @@ define amdgpu_ps void @global_dec_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = atomicrmw udec_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic
+ %unused = atomicrmw udec_wrap ptr addrspace(1) %gep1, i32 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -3951,7 +3951,7 @@ define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX12-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic
+ %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -3977,7 +3977,7 @@ define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn_neg128(ptr addrspace(1) i
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic
+ %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -3999,7 +3999,7 @@ define amdgpu_ps void @global_dec_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX12-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic
+ %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -4021,8 +4021,10 @@ define amdgpu_ps void @global_dec_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = atomicrmw udec_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic
+ %unused = atomicrmw udec_wrap ptr addrspace(1) %gep1, i64 %data syncscope("agent") monotonic, !amdgpu.no.fine.grained.memory !0
ret void
}
attributes #0 = { argmemonly nounwind willreturn }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index 2aa198f..da132d0 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -824,7 +824,7 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in)
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -893,7 +893,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -971,7 +971,7 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
- %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -1060,7 +1060,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %ou
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
- %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -1117,7 +1117,7 @@ define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -1185,7 +1185,7 @@ define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspa
; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -1260,7 +1260,7 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in,
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
- %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -1346,7 +1346,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
- %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -1404,7 +1404,7 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in)
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -1473,7 +1473,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -1551,7 +1551,7 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
- %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -1640,7 +1640,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %ou
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
- %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -1697,7 +1697,7 @@ define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -1765,7 +1765,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspa
; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -1840,7 +1840,7 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in,
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
- %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -1926,7 +1926,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
- %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -1984,7 +1984,7 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in)
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -2049,7 +2049,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -2120,7 +2120,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
- %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -2205,7 +2205,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
- %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -2255,7 +2255,7 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -2319,7 +2319,7 @@ define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspa
; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -2387,7 +2387,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in,
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
- %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -2469,7 +2469,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
- %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -2520,7 +2520,7 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -2585,7 +2585,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -2656,7 +2656,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out,
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
- %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -2741,7 +2741,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
- %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -2791,7 +2791,7 @@ define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -2855,7 +2855,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrsp
; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -2923,7 +2923,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
- %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -3005,7 +3005,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
- %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -3056,7 +3056,7 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in)
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -3121,7 +3121,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -3192,7 +3192,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
- %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -3277,7 +3277,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
- %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -3327,7 +3327,7 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -3391,7 +3391,7 @@ define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspa
; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -3459,7 +3459,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in,
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
- %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -3541,7 +3541,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
- %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -3592,7 +3592,7 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -3657,7 +3657,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -3728,7 +3728,7 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out,
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
- %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -3813,7 +3813,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %o
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
- %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -3863,7 +3863,7 @@ define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -3927,7 +3927,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrsp
; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -3995,7 +3995,7 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
- %val = atomicrmw volatile umin ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umin ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -4077,7 +4077,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
- %val = atomicrmw volatile umin ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+ %val = atomicrmw volatile umin ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -4135,7 +4135,7 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in)
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -4204,7 +4204,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr addrspace(1) %out, ptr a
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -4282,7 +4282,7 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i3
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
- %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -4371,7 +4371,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
- %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -4428,7 +4428,7 @@ define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile or ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile or ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -4496,7 +4496,7 @@ define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspac
; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile or ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile or ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -4571,7 +4571,7 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in,
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
- %val = atomicrmw volatile or ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile or ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -4657,7 +4657,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr a
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
- %val = atomicrmw volatile or ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile or ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -4715,7 +4715,7 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -5978,7 +5978,7 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in)
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -6047,7 +6047,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -6125,7 +6125,7 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
- %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -6214,7 +6214,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %ou
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
- %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -6271,7 +6271,7 @@ define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile xor ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile xor ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -6339,7 +6339,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspa
; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_endpgm
entry:
- %val = atomicrmw volatile xor ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile xor ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -6414,7 +6414,7 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in,
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
- %val = atomicrmw volatile xor ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile xor ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -6500,7 +6500,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_endpgm
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
- %val = atomicrmw volatile xor ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile xor ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -8233,7 +8233,7 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in)
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -8294,7 +8294,7 @@ define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out,
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024
- %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -8353,7 +8353,7 @@ define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 9000
- %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -8418,7 +8418,7 @@ define amdgpu_kernel void @atomic_inc_i32_huge_offset(ptr addrspace(1) %out, i32
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595
- %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -8487,7 +8487,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -8565,7 +8565,7 @@ define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
- %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -8654,7 +8654,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %ou
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
- %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -8712,7 +8712,7 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in)
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -8773,7 +8773,7 @@ define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out,
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024
- %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -8832,7 +8832,7 @@ define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 9000
- %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -8897,7 +8897,7 @@ define amdgpu_kernel void @atomic_dec_i32_huge_offset(ptr addrspace(1) %out, i32
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595
- %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -8966,7 +8966,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
- %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -9044,7 +9044,7 @@ define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
- %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -9133,7 +9133,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %ou
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
- %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
+ %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %out2
ret void
}
@@ -9437,3 +9437,243 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr
store bfloat %val, ptr addrspace(1) %out
ret void
}
+
+define amdgpu_kernel void @atomic_sub_i16_soffset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i16 %in) {
+; SI-LABEL: atomic_sub_i16_soffset__amdgpu_no_remote_memory:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_load_dword s2, s[4:5], 0xb
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_u32 s6, s0, 0x4650
+; SI-NEXT: s_addc_u32 s1, s1, 0
+; SI-NEXT: s_and_b32 s0, s6, -4
+; SI-NEXT: s_and_b32 s6, s6, 3
+; SI-NEXT: s_and_b32 s2, s2, 0xffff
+; SI-NEXT: s_load_dword s9, s[0:1], 0x0
+; SI-NEXT: s_lshl_b32 s7, s6, 3
+; SI-NEXT: s_lshl_b32 s6, 0xffff, s7
+; SI-NEXT: s_lshl_b32 s7, s2, s7
+; SI-NEXT: s_not_b32 s8, s6
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v1, s9
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: .LBB136_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: v_subrev_i32_e32 v0, vcc, s7, v1
+; SI-NEXT: v_and_b32_e32 v0, s6, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, s8, v1
+; SI-NEXT: v_or_b32_e32 v0, v2, v0
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; SI-NEXT: s_cbranch_execnz .LBB136_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: atomic_sub_i16_soffset__amdgpu_no_remote_memory:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s3, s0, 0x4650
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_and_b32 s0, s3, -4
+; VI-NEXT: s_load_dword s9, s[0:1], 0x0
+; VI-NEXT: s_and_b32 s3, s3, 3
+; VI-NEXT: s_lshl_b32 s3, s3, 3
+; VI-NEXT: s_lshl_b32 s6, 0xffff, s3
+; VI-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-NEXT: s_not_b32 s7, s6
+; VI-NEXT: s_lshl_b32 s8, s2, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: .LBB136_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_subrev_u32_e32 v0, vcc, s8, v1
+; VI-NEXT: v_and_b32_e32 v2, s7, v1
+; VI-NEXT: v_and_b32_e32 v0, s6, v0
+; VI-NEXT: v_or_b32_e32 v0, v2, v0
+; VI-NEXT: v_mov_b32_e32 v3, v1
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v1, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB136_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: atomic_sub_i16_soffset__amdgpu_no_remote_memory:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_u32 s3, s0, 0x4650
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_and_b32 s0, s3, -4
+; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0
+; GFX9-NEXT: s_and_b32 s3, s3, 3
+; GFX9-NEXT: s_lshl_b32 s3, s3, 3
+; GFX9-NEXT: s_lshl_b32 s4, 0xffff, s3
+; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX9-NEXT: s_not_b32 s5, s4
+; GFX9-NEXT: s_lshl_b32 s6, s2, s3
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: .LBB136_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v1
+; GFX9-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB136_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_endpgm
+ %gep = getelementptr i16, ptr addrspace(1) %out, i64 9000
+ %val = atomicrmw sub ptr addrspace(1) %gep, i16 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+ ret void
+}
+
+define amdgpu_kernel void @atomic_sub_i8_soffset__amdgpu_no_remote_memory(ptr addrspace(1) %out, i8 %in) {
+; SI-LABEL: atomic_sub_i8_soffset__amdgpu_no_remote_memory:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_load_dword s2, s[4:5], 0xb
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_u32 s6, s0, 0x2328
+; SI-NEXT: s_addc_u32 s1, s1, 0
+; SI-NEXT: s_and_b32 s0, s6, -4
+; SI-NEXT: s_and_b32 s6, s6, 3
+; SI-NEXT: s_and_b32 s2, s2, 0xff
+; SI-NEXT: s_load_dword s9, s[0:1], 0x0
+; SI-NEXT: s_lshl_b32 s7, s6, 3
+; SI-NEXT: s_lshl_b32 s6, 0xff, s7
+; SI-NEXT: s_lshl_b32 s7, s2, s7
+; SI-NEXT: s_not_b32 s8, s6
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v1, s9
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: .LBB137_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: v_subrev_i32_e32 v0, vcc, s7, v1
+; SI-NEXT: v_and_b32_e32 v0, s6, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, s8, v1
+; SI-NEXT: v_or_b32_e32 v0, v2, v0
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; SI-NEXT: s_cbranch_execnz .LBB137_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: atomic_sub_i8_soffset__amdgpu_no_remote_memory:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s3, s0, 0x2328
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_and_b32 s0, s3, -4
+; VI-NEXT: s_load_dword s9, s[0:1], 0x0
+; VI-NEXT: s_and_b32 s3, s3, 3
+; VI-NEXT: s_lshl_b32 s3, s3, 3
+; VI-NEXT: s_lshl_b32 s6, 0xff, s3
+; VI-NEXT: s_and_b32 s2, s2, 0xff
+; VI-NEXT: s_not_b32 s7, s6
+; VI-NEXT: s_lshl_b32 s8, s2, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: .LBB137_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_subrev_u32_e32 v0, vcc, s8, v1
+; VI-NEXT: v_and_b32_e32 v2, s7, v1
+; VI-NEXT: v_and_b32_e32 v0, s6, v0
+; VI-NEXT: v_or_b32_e32 v0, v2, v0
+; VI-NEXT: v_mov_b32_e32 v3, v1
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v1, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB137_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: atomic_sub_i8_soffset__amdgpu_no_remote_memory:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_u32 s3, s0, 0x2328
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: s_and_b32 s0, s3, -4
+; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0
+; GFX9-NEXT: s_and_b32 s3, s3, 3
+; GFX9-NEXT: s_lshl_b32 s3, s3, 3
+; GFX9-NEXT: s_lshl_b32 s4, 0xff, s3
+; GFX9-NEXT: s_and_b32 s2, s2, 0xff
+; GFX9-NEXT: s_not_b32 s5, s4
+; GFX9-NEXT: s_lshl_b32 s6, s2, s3
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: .LBB137_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v1
+; GFX9-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB137_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_endpgm
+ %gep = getelementptr i8, ptr addrspace(1) %out, i64 9000
+ %val = atomicrmw sub ptr addrspace(1) %gep, i8 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+ ret void
+}
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index a867c6c1a..ffab568 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -1284,26 +1284,68 @@ define void @global_atomic_sub_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB30_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_sub_i32_e32 v3, vcc, v4, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB30_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_sub_i32_noret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_sub v[0:1], v2
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB30_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v4, v2
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB30_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_sub_i32_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_sub v[0:1], v2, off
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB30_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
ret void
@@ -1317,9 +1359,25 @@ define void @global_atomic_sub_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB31_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_sub_i32_e32 v3, vcc, v4, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB31_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1328,17 +1386,43 @@ define void @global_atomic_sub_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_sub v[0:1], v2
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB31_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v4, v2
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB31_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_sub_i32_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_sub v[0:1], v2, off offset:16
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB31_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
@@ -1353,27 +1437,71 @@ define i32 @global_atomic_sub_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB32_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_sub_i32_e32 v4, vcc, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB32_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_sub_i32_ret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB32_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: v_sub_u32_e32 v3, vcc, v4, v2
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB32_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_sub_i32_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_sub v0, v[0:1], v2, off glc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB32_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
ret i32 %result
@@ -1387,29 +1515,72 @@ define i32 @global_atomic_sub_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB33_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_sub_i32_e32 v4, vcc, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB33_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_sub_i32_ret_offset:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[3:4]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB33_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, v0
+; VI-NEXT: v_sub_u32_e32 v0, vcc, v1, v2
+; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB33_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_sub_i32_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_sub v0, v[0:1], v2, off offset:16 glc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB33_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
@@ -1421,23 +1592,37 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inre
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB34_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0
+; SI-NEXT: v_subrev_i32_e32 v0, vcc, s34, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB34_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1447,20 +1632,44 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inre
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_sub v[0:1], v2
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB34_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_subrev_u32_e32 v2, vcc, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB34_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_sub_i32_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_sub v0, v1, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v1
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB34_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
ret void
@@ -1471,23 +1680,37 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_offset_scalar(ptr addrspace(
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB35_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16
+; SI-NEXT: v_subrev_i32_e32 v0, vcc, s34, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB35_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1499,20 +1722,44 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_offset_scalar(ptr addrspace(
; VI-NEXT: s_addc_u32 s35, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s34
; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_sub v[0:1], v2
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB35_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_subrev_u32_e32 v2, vcc, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB35_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_sub_i32_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_sub v0, v1, s[4:5] offset:16
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v1
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB35_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
@@ -1524,23 +1771,37 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_scalar(ptr addrspace(1) inreg %
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB36_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_subrev_i32_e32 v1, vcc, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB36_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1550,20 +1811,46 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_scalar(ptr addrspace(1) inreg %
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: .LBB36_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_subrev_u32_e32 v3, vcc, s6, v4
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB36_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_sub_i32_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[4:5] glc
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3
+; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB36_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
ret i32 %result
@@ -1574,23 +1861,37 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_offset_scalar(ptr addrspace(1)
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB37_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_subrev_i32_e32 v1, vcc, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB37_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1600,22 +1901,46 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 16
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
+; VI-NEXT: v_mov_b32_e32 v1, s34
+; VI-NEXT: v_mov_b32_e32 v2, s35
+; VI-NEXT: flat_load_dword v0, v[1:2]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB37_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_subrev_u32_e32 v3, vcc, s6, v4
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB37_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_sub_i32_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[4:5] offset:16 glc
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3
+; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB37_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
@@ -1744,26 +2069,68 @@ define void @global_atomic_and_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB41_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, v4, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB41_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_and_i32_noret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_and v[0:1], v2
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB41_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_and_b32_e32 v3, v4, v2
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB41_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_and_i32_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_and v[0:1], v2, off
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB41_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst
ret void
@@ -1777,9 +2144,25 @@ define void @global_atomic_and_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB42_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, v4, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB42_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1788,17 +2171,43 @@ define void @global_atomic_and_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_and v[0:1], v2
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB42_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_and_b32_e32 v3, v4, v2
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB42_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_and_i32_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_and v[0:1], v2, off offset:16
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB42_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst
@@ -1813,27 +2222,71 @@ define i32 @global_atomic_and_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB43_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB43_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_and_i32_ret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB43_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: v_and_b32_e32 v3, v4, v2
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB43_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_and_i32_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off glc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_and_b32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB43_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst
ret i32 %result
@@ -1847,29 +2300,72 @@ define i32 @global_atomic_and_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB44_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB44_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_and_i32_ret_offset:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[3:4]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB44_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, v0
+; VI-NEXT: v_and_b32_e32 v0, v1, v2
+; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB44_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_and_i32_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off offset:16 glc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_and_b32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB44_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst
@@ -1881,23 +2377,37 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr addrspace(1) inre
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB45_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0
+; SI-NEXT: v_and_b32_e32 v0, s34, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB45_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1907,20 +2417,44 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr addrspace(1) inre
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_and v[0:1], v2
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB45_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_and_b32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB45_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_and_i32_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_and v0, v1, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, s6, v1
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB45_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst
ret void
@@ -1931,23 +2465,37 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace(
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB46_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16
+; SI-NEXT: v_and_b32_e32 v0, s34, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB46_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1959,20 +2507,44 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace(
; VI-NEXT: s_addc_u32 s35, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s34
; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_and v[0:1], v2
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB46_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_and_b32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB46_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_and_i32_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_and v0, v1, s[4:5] offset:16
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, s6, v1
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB46_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst
@@ -1984,23 +2556,37 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_scalar(ptr addrspace(1) inreg %
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB47_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB47_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -2010,20 +2596,46 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_scalar(ptr addrspace(1) inreg %
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: .LBB47_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_and_b32_e32 v3, s6, v4
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB47_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_and_i32_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_and v0, v0, v1, s[4:5] glc
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_and_b32_e32 v2, s6, v3
+; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB47_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst
ret i32 %result
@@ -2034,23 +2646,37 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1)
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB48_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB48_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -2060,22 +2686,46 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 16
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc
+; VI-NEXT: v_mov_b32_e32 v1, s34
+; VI-NEXT: v_mov_b32_e32 v2, s35
+; VI-NEXT: flat_load_dword v0, v[1:2]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB48_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_and_b32_e32 v3, s6, v4
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB48_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_and_i32_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_and v0, v0, v1, s[4:5] offset:16 glc
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_and_b32_e32 v2, s6, v3
+; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB48_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3030,26 +3680,68 @@ define void @global_atomic_or_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB61_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v3, v4, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB61_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_or_i32_noret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_or v[0:1], v2
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB61_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_e32 v3, v4, v2
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB61_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_or_i32_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_or v[0:1], v2, off
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB61_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
ret void
@@ -3063,9 +3755,25 @@ define void @global_atomic_or_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB62_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v3, v4, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB62_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -3074,17 +3782,43 @@ define void @global_atomic_or_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_or v[0:1], v2
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB62_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_e32 v3, v4, v2
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB62_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_or_i32_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_or v[0:1], v2, off offset:16
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB62_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3099,27 +3833,71 @@ define i32 @global_atomic_or_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB63_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_or_b32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB63_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_or_i32_ret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB63_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: v_or_b32_e32 v3, v4, v2
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB63_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_or_i32_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off glc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_or_b32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB63_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
ret i32 %result
@@ -3133,29 +3911,72 @@ define i32 @global_atomic_or_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB64_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_or_b32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB64_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_or_i32_ret_offset:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[3:4]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB64_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, v0
+; VI-NEXT: v_or_b32_e32 v0, v1, v2
+; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB64_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_or_i32_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off offset:16 glc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB64_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_or_b32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB64_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3167,23 +3988,37 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB65_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0
+; SI-NEXT: v_or_b32_e32 v0, s34, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB65_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3193,20 +4028,44 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_or v[0:1], v2
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB65_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB65_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_or_i32_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_or v0, v1, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_e32 v0, s6, v1
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB65_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
ret void
@@ -3217,23 +4076,37 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB66_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16
+; SI-NEXT: v_or_b32_e32 v0, s34, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB66_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3245,20 +4118,44 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1
; VI-NEXT: s_addc_u32 s35, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s34
; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_or v[0:1], v2
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB66_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB66_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_or_i32_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_or v0, v1, s[4:5] offset:16
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_e32 v0, s6, v1
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB66_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3270,23 +4167,37 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_scalar(ptr addrspace(1) inreg %p
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB67_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_or_b32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB67_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3296,20 +4207,46 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_scalar(ptr addrspace(1) inreg %p
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: .LBB67_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_or_b32_e32 v3, s6, v4
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB67_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_or_i32_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_or v0, v0, v1, s[4:5] glc
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_or_b32_e32 v2, s6, v3
+; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB67_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
ret i32 %result
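; Illustrative sketch (hand-written; not produced by update_llc_test_checks.py, and the
; value names %start, %pair, %new, %newloaded are placeholders): the %atomicrmw.start /
; %atomicrmw.end blocks checked above correspond to the atomic expansion lowering a
; seq_cst atomicrmw without !amdgpu.no.remote.memory into a cmpxchg retry loop, roughly:
;
;   %start = load i32, ptr addrspace(1) %ptr
;   br label %atomicrmw.start
; atomicrmw.start:
;   %loaded = phi i32 [ %start, %entry ], [ %newloaded, %atomicrmw.start ]
;   %new = or i32 %loaded, %in
;   %pair = cmpxchg ptr addrspace(1) %ptr, i32 %loaded, i32 %new seq_cst seq_cst
;   %newloaded = extractvalue { i32, i1 } %pair, 0
;   %success = extractvalue { i32, i1 } %pair, 1
;   br i1 %success, label %atomicrmw.end, label %atomicrmw.start
; atomicrmw.end:
;
; The generated checks encode exactly this loop: the cmpswap result is compared against
; the previous value, failed lanes are accumulated into an SGPR pair, and the loop exits
; once s_andn2_b64 clears exec for all lanes.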
@@ -3320,23 +4257,37 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_offset_scalar(ptr addrspace(1) i
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB68_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_or_b32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB68_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3346,22 +4297,46 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_offset_scalar(ptr addrspace(1) i
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 16
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc
+; VI-NEXT: v_mov_b32_e32 v1, s34
+; VI-NEXT: v_mov_b32_e32 v2, s35
+; VI-NEXT: flat_load_dword v0, v[1:2]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB68_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_or_b32_e32 v3, s6, v4
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB68_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_or_i32_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_or v0, v0, v1, s[4:5] offset:16 glc
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB68_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_or_b32_e32 v2, s6, v3
+; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB68_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3490,26 +4465,68 @@ define void @global_atomic_xor_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB72_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_xor_b32_e32 v3, v4, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB72_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xor_i32_noret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_xor v[0:1], v2
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB72_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v3, v4, v2
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB72_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xor_i32_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_xor v[0:1], v2, off
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB72_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB72_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
ret void
@@ -3523,9 +4540,25 @@ define void @global_atomic_xor_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB73_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_xor_b32_e32 v3, v4, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB73_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -3534,17 +4567,43 @@ define void @global_atomic_xor_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_xor v[0:1], v2
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB73_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v3, v4, v2
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB73_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xor_i32_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_xor v[0:1], v2, off offset:16
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB73_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB73_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3559,27 +4618,71 @@ define i32 @global_atomic_xor_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB74_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_xor_b32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB74_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xor_i32_ret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB74_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: v_xor_b32_e32 v3, v4, v2
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB74_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xor_i32_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off glc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB74_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB74_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
ret i32 %result
@@ -3593,29 +4696,72 @@ define i32 @global_atomic_xor_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB75_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_xor_b32_e32 v4, v5, v2
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB75_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xor_i32_ret_offset:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[3:4]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB75_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, v0
+; VI-NEXT: v_xor_b32_e32 v0, v1, v2
+; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB75_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xor_i32_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off offset:16 glc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB75_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3627,23 +4773,37 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inre
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB76_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0
+; SI-NEXT: v_xor_b32_e32 v0, s34, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB76_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3653,20 +4813,44 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inre
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_xor v[0:1], v2
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB76_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB76_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xor_i32_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_xor v0, v1, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v0, s6, v1
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB76_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
ret void
@@ -3677,23 +4861,37 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace(
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB77_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16
+; SI-NEXT: v_xor_b32_e32 v0, s34, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB77_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3705,20 +4903,44 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace(
; VI-NEXT: s_addc_u32 s35, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s34
; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_xor v[0:1], v2
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB77_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v2, s6, v3
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB77_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xor_i32_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_xor v0, v1, s[4:5] offset:16
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v0, s6, v1
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB77_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3730,23 +4952,37 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_scalar(ptr addrspace(1) inreg %
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB78_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_xor_b32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB78_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3756,20 +4992,46 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_scalar(ptr addrspace(1) inreg %
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: .LBB78_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_xor_b32_e32 v3, s6, v4
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB78_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xor_i32_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[4:5] glc
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB78_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_xor_b32_e32 v2, s6, v3
+; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB78_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
ret i32 %result
@@ -3780,23 +5042,37 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_offset_scalar(ptr addrspace(1)
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB79_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_xor_b32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB79_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3806,22 +5082,46 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 16
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
+; VI-NEXT: v_mov_b32_e32 v1, s34
+; VI-NEXT: v_mov_b32_e32 v2, s35
+; VI-NEXT: flat_load_dword v0, v[1:2]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB79_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_xor_b32_e32 v3, s6, v4
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB79_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xor_i32_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[4:5] offset:16 glc
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB79_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_xor_b32_e32 v2, s6, v3
+; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB79_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
@@ -5041,25 +6341,9 @@ define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory(ptr add
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB95_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_max_i32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
-; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_atomic_smax v2, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB95_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -5068,43 +6352,17 @@ define void @global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory(ptr add
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB95_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_max_i32_e32 v3, v4, v2
-; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; VI-NEXT: flat_atomic_smax v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v4, v3
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB95_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_max_i32_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_i32_e32 v3, v4, v2
-; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
+; GFX9-NEXT: global_atomic_smax v[0:1], v2, off offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v4, v3
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB95_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
%tmp0 = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -5119,72 +6377,29 @@ define i32 @global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr addrsp
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB96_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_max_i32_e32 v4, v5, v2
-; SI-NEXT: v_mov_b32_e32 v3, v4
-; SI-NEXT: v_mov_b32_e32 v4, v5
-; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_atomic_smax v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB96_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v0, v3
+; SI-NEXT: v_mov_b32_e32 v0, v2
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[3:4]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB96_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_max_i32_e32 v0, v1, v2
-; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_atomic_smax v0, v[0:1], v2 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB96_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_max_i32_ret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, v3
-; GFX9-NEXT: v_max_i32_e32 v3, v4, v2
-; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
+; GFX9-NEXT: global_atomic_smax v0, v[0:1], v2, off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB96_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
%result = atomicrmw max ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -6199,25 +7414,9 @@ define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr ad
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB108_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_max_u32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
-; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_atomic_umax v2, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB108_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -6226,43 +7425,17 @@ define void @global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr ad
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB108_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_max_u32_e32 v3, v4, v2
-; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; VI-NEXT: flat_atomic_umax v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v4, v3
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB108_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_umax_i32_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_u32_e32 v3, v4, v2
-; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
+; GFX9-NEXT: global_atomic_umax v[0:1], v2, off offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v4, v3
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB108_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
%tmp0 = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -6277,72 +7450,29 @@ define i32 @global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB109_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_max_u32_e32 v4, v5, v2
-; SI-NEXT: v_mov_b32_e32 v3, v4
-; SI-NEXT: v_mov_b32_e32 v4, v5
-; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_atomic_umax v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB109_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v0, v3
+; SI-NEXT: v_mov_b32_e32 v0, v2
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[3:4]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB109_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_max_u32_e32 v0, v1, v2
-; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_atomic_umax v0, v[0:1], v2 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB109_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, v3
-; GFX9-NEXT: v_max_u32_e32 v3, v4, v2
-; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
+; GFX9-NEXT: global_atomic_umax v0, v[0:1], v2, off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB109_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
%result = atomicrmw umax ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -7032,25 +8162,9 @@ define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr ad
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB118_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_min_u32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
-; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_atomic_umin v2, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB118_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -7059,43 +8173,17 @@ define void @global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr ad
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB118_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_min_u32_e32 v3, v4, v2
-; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; VI-NEXT: flat_atomic_umin v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v4, v3
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB118_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_umin_i32_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_min_u32_e32 v3, v4, v2
-; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
+; GFX9-NEXT: global_atomic_umin v[0:1], v2, off offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v4, v3
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB118_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
%tmp0 = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -7110,72 +8198,29 @@ define i32 @global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr addrs
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB119_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_min_u32_e32 v4, v5, v2
-; SI-NEXT: v_mov_b32_e32 v3, v4
-; SI-NEXT: v_mov_b32_e32 v4, v5
-; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_atomic_umin v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB119_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v0, v3
+; SI-NEXT: v_mov_b32_e32 v0, v2
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[3:4]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB119_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_min_u32_e32 v0, v1, v2
-; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_atomic_umin v0, v[0:1], v2 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB119_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB119_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, v3
-; GFX9-NEXT: v_min_u32_e32 v3, v4, v2
-; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
+; GFX9-NEXT: global_atomic_umin v0, v[0:1], v2, off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB119_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
%result = atomicrmw umin ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -8272,25 +9317,9 @@ define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory(ptr add
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB132_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_min_i32_e32 v3, v4, v2
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v6, v4
-; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_atomic_smin v2, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v4, v5
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB132_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -8299,43 +9328,17 @@ define void @global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory(ptr add
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB132_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_min_i32_e32 v3, v4, v2
-; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; VI-NEXT: flat_atomic_smin v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v4, v3
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB132_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_min_i32_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB132_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_min_i32_e32 v3, v4, v2
-; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
+; GFX9-NEXT: global_atomic_smin v[0:1], v2, off offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v4, v3
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB132_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
%tmp0 = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -8350,72 +9353,29 @@ define i32 @global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr addrsp
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB133_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_min_i32_e32 v4, v5, v2
-; SI-NEXT: v_mov_b32_e32 v3, v4
-; SI-NEXT: v_mov_b32_e32 v4, v5
-; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_atomic_smin v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB133_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v0, v3
+; SI-NEXT: v_mov_b32_e32 v0, v2
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[3:4]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB133_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: v_min_i32_e32 v0, v1, v2
-; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_atomic_smin v0, v[0:1], v2 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB133_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_min_i32_ret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB133_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, v3
-; GFX9-NEXT: v_min_i32_e32 v3, v4, v2
-; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
+; GFX9-NEXT: global_atomic_smin v0, v[0:1], v2, off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB133_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
%result = atomicrmw min ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -8434,26 +9394,74 @@ define void @global_atomic_uinc_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB134_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v4
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; SI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB134_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_uinc_wrap_i32_noret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_inc v[0:1], v2
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB134_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v4
+; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; VI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB134_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_inc v[0:1], v2, off
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB134_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v3, 1, v4
+; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB134_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
ret void
@@ -8467,9 +9475,27 @@ define void @global_atomic_uinc_wrap_i32_noret_offset(ptr addrspace(1) %out, i32
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB135_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v4
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; SI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB135_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -8478,17 +9504,47 @@ define void @global_atomic_uinc_wrap_i32_noret_offset(ptr addrspace(1) %out, i32
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_inc v[0:1], v2
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB135_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v4
+; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; VI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB135_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_inc v[0:1], v2, off offset:16
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB135_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v3, 1, v4
+; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB135_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -8503,27 +9559,77 @@ define i32 @global_atomic_uinc_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB136_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v5
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, v5, v2
+; SI-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB136_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_uinc_wrap_i32_ret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB136_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v4
+; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; VI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB136_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off glc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB136_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_add_u32_e32 v3, 1, v4
+; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB136_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
ret i32 %result
@@ -8537,29 +9643,78 @@ define i32 @global_atomic_uinc_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB137_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v5
+; SI-NEXT: v_cmp_lt_u32_e32 vcc, v5, v2
+; SI-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB137_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[3:4]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB137_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1
+; VI-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2
+; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB137_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off offset:16 glc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB137_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_add_u32_e32 v3, 1, v4
+; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB137_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -8571,23 +9726,39 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB138_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v1
+; SI-NEXT: v_cmp_gt_u32_e32 vcc, s34, v1
+; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB138_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -8597,20 +9768,48 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_inc v[0:1], v2
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB138_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v3
+; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3
+; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB138_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_inc v0, v1, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB138_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v0, 1, v1
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB138_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
ret void
@@ -8621,23 +9820,39 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addr
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB139_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16
+; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v1
+; SI-NEXT: v_cmp_gt_u32_e32 vcc, s34, v1
+; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v3, v1
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB139_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -8649,20 +9864,48 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addr
; VI-NEXT: s_addc_u32 s35, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s34
; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_inc v[0:1], v2
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB139_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v3
+; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3
+; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB139_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_inc v0, v1, s[4:5] offset:16
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB139_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v0, 1, v1
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB139_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -8674,23 +9917,39 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_scalar(ptr addrspace(1) i
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB140_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v2
+; SI-NEXT: v_cmp_gt_u32_e32 vcc, s34, v2
+; SI-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB140_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -8700,20 +9959,50 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_scalar(ptr addrspace(1) i
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: .LBB140_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v4
+; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4
+; VI-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB140_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_inc v0, v0, v1, s[4:5] glc
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB140_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_add_u32_e32 v0, 1, v3
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc
+; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB140_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
ret i32 %result
@@ -8724,23 +10013,39 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspa
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB141_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v2
+; SI-NEXT: v_cmp_gt_u32_e32 vcc, s34, v2
+; SI-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB141_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -8750,22 +10055,50 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspa
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 16
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; VI-NEXT: v_mov_b32_e32 v1, s34
+; VI-NEXT: v_mov_b32_e32 v2, s35
+; VI-NEXT: flat_load_dword v0, v[1:2]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB141_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v4
+; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4
+; VI-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB141_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_inc v0, v0, v1, s[4:5] offset:16 glc
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB141_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_add_u32_e32 v0, 1, v3
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc
+; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB141_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -8853,30 +10186,84 @@ define void @global_atomic_udec_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
; SI-LABEL: global_atomic_udec_wrap_i32_noret:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s6, 0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s4, s6
-; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s8, s10
+; SI-NEXT: s_mov_b32 s9, s10
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
+; SI-NEXT: s_mov_b64 s[6:7], 0
+; SI-NEXT: .LBB144_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; SI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; SI-NEXT: s_cbranch_execnz .LBB144_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[6:7]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_udec_wrap_i32_noret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_dec v[0:1], v2
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_mov_b64 s[6:7], 0
+; VI-NEXT: .LBB144_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; VI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; VI-NEXT: s_cbranch_execnz .LBB144_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[6:7]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_udec_wrap_i32_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_dec v[0:1], v2, off
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[6:7], 0
+; GFX9-NEXT: .LBB144_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GFX9-NEXT: v_add_u32_e32 v3, -1, v4
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_cbranch_execnz .LBB144_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
ret void
@@ -8886,13 +10273,33 @@ define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32
; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s6, 0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s4, s6
-; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16
+; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s8, s10
+; SI-NEXT: s_mov_b32 s9, s10
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[6:7], 0
+; SI-NEXT: .LBB145_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; SI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v4
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[8:11], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; SI-NEXT: s_cbranch_execnz .LBB145_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[6:7]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -8901,17 +10308,51 @@ define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_dec v[0:1], v2
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_mov_b64 s[6:7], 0
+; VI-NEXT: .LBB145_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; VI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; VI-NEXT: s_cbranch_execnz .LBB145_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[6:7]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_dec v[0:1], v2, off offset:16
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[6:7], 0
+; GFX9-NEXT: .LBB145_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GFX9-NEXT: v_add_u32_e32 v3, -1, v4
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_cbranch_execnz .LBB145_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -8922,31 +10363,87 @@ define i32 @global_atomic_udec_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
; SI-LABEL: global_atomic_udec_wrap_i32_ret:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s6, 0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s4, s6
-; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s8, s10
+; SI-NEXT: s_mov_b32 s9, s10
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
+; SI-NEXT: s_mov_b64 s[6:7], 0
+; SI-NEXT: .LBB146_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v5
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v5, v2
+; SI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; SI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; SI-NEXT: s_cbranch_execnz .LBB146_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[6:7]
+; SI-NEXT: v_mov_b32_e32 v0, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_udec_wrap_i32_ret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[6:7], 0
+; VI-NEXT: .LBB146_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, v3
+; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; VI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; VI-NEXT: s_cbranch_execnz .LBB146_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[6:7]
+; VI-NEXT: v_mov_b32_e32 v0, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_udec_wrap_i32_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off glc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[6:7], 0
+; GFX9-NEXT: .LBB146_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GFX9-NEXT: v_add_u32_e32 v3, -1, v4
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_cbranch_execnz .LBB146_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
ret i32 %result
@@ -8956,33 +10453,88 @@ define i32 @global_atomic_udec_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i
; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s6, 0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s4, s6
-; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s8, s10
+; SI-NEXT: s_mov_b32 s9, s10
+; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:16
+; SI-NEXT: s_mov_b64 s[6:7], 0
+; SI-NEXT: .LBB147_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v5
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v5, v2
+; SI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v3, v4
+; SI-NEXT: v_mov_b32_e32 v4, v5
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[8:11], 0 addr64 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; SI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; SI-NEXT: s_cbranch_execnz .LBB147_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[6:7]
+; SI-NEXT: v_mov_b32_e32 v0, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[3:4]
+; VI-NEXT: s_mov_b64 s[6:7], 0
+; VI-NEXT: .LBB147_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v1
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2
+; VI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; VI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; VI-NEXT: s_cbranch_execnz .LBB147_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[6:7]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off offset:16 glc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
+; GFX9-NEXT: s_mov_b64 s[6:7], 0
+; GFX9-NEXT: .LBB147_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2
+; GFX9-NEXT: v_add_u32_e32 v3, -1, v4
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc
+; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_cbranch_execnz .LBB147_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -8994,23 +10546,42 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v5, s6, 0
+; SI-NEXT: v_writelane_b32 v5, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[38:39], 0
+; SI-NEXT: v_mov_b32_e32 v2, s34
+; SI-NEXT: .LBB148_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0
+; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v1
+; SI-NEXT: s_or_b64 vcc, vcc, s[36:37]
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v4, v1
+; SI-NEXT: v_mov_b32_e32 v3, v0
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; SI-NEXT: s_cbranch_execnz .LBB148_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[38:39]
+; SI-NEXT: v_readlane_b32 s7, v5, 1
+; SI-NEXT: v_readlane_b32 s6, v5, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -9020,20 +10591,54 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_dec v[0:1], v2
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: .LBB148_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v3
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
+; VI-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: s_cbranch_execnz .LBB148_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[36:37]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_dec v0, v1, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[36:37], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-NEXT: .LBB148_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1
+; GFX9-NEXT: v_add_u32_e32 v0, -1, v1
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX9-NEXT: s_cbranch_execnz .LBB148_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[36:37]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
ret void
@@ -9044,23 +10649,42 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v5, s6, 0
+; SI-NEXT: v_writelane_b32 v5, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
+; SI-NEXT: s_mov_b64 s[38:39], 0
+; SI-NEXT: v_mov_b32_e32 v2, s34
+; SI-NEXT: .LBB149_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16
+; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v1
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v1
+; SI-NEXT: s_or_b64 vcc, vcc, s[36:37]
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v4, v1
+; SI-NEXT: v_mov_b32_e32 v3, v0
+; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
+; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; SI-NEXT: s_cbranch_execnz .LBB149_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[38:39]
+; SI-NEXT: v_readlane_b32 s7, v5, 1
+; SI-NEXT: v_readlane_b32 s6, v5, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -9072,20 +10696,54 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr
; VI-NEXT: s_addc_u32 s35, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s34
; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_dec v[0:1], v2
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: .LBB149_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v3
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3
+; VI-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: s_cbranch_execnz .LBB149_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[36:37]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_dec v0, v1, s[4:5] offset:16
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
+; GFX9-NEXT: s_mov_b64 s[36:37], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-NEXT: .LBB149_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1
+; GFX9-NEXT: v_add_u32_e32 v0, -1, v1
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX9-NEXT: s_cbranch_execnz .LBB149_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[36:37]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -9097,23 +10755,42 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v5, s6, 0
+; SI-NEXT: v_writelane_b32 v5, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[38:39], 0
+; SI-NEXT: v_mov_b32_e32 v2, s34
+; SI-NEXT: .LBB150_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v4
+; SI-NEXT: s_or_b64 vcc, vcc, s[36:37]
+; SI-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v3
+; SI-NEXT: v_mov_b32_e32 v1, v4
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; SI-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; SI-NEXT: s_cbranch_execnz .LBB150_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[38:39]
+; SI-NEXT: v_readlane_b32 s7, v5, 1
+; SI-NEXT: v_readlane_b32 s6, v5, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -9123,20 +10800,56 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v3, s6
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: .LBB150_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v5
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
+; VI-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; VI-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: s_cbranch_execnz .LBB150_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[36:37]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_dec v0, v0, v1, s[4:5] glc
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_dword v0, v1, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[36:37], 0
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: .LBB150_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v4
+; GFX9-NEXT: v_add_u32_e32 v0, -1, v4
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc
+; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[3:4], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX9-NEXT: s_cbranch_execnz .LBB150_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[36:37]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
ret i32 %result
@@ -9147,23 +10860,42 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v5, s6, 0
+; SI-NEXT: v_writelane_b32 v5, s7, 1
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s34
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
+; SI-NEXT: s_mov_b64 s[38:39], 0
+; SI-NEXT: v_mov_b32_e32 v2, s34
+; SI-NEXT: .LBB151_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16 glc
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v4
+; SI-NEXT: s_or_b64 vcc, vcc, s[36:37]
+; SI-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v3
+; SI-NEXT: v_mov_b32_e32 v1, v4
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; SI-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; SI-NEXT: s_cbranch_execnz .LBB151_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[38:39]
+; SI-NEXT: v_readlane_b32 s7, v5, 1
+; SI-NEXT: v_readlane_b32 s6, v5, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -9173,22 +10905,56 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 16
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s34
-; VI-NEXT: v_mov_b32_e32 v1, s35
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
+; VI-NEXT: v_mov_b32_e32 v1, s34
+; VI-NEXT: v_mov_b32_e32 v2, s35
+; VI-NEXT: flat_load_dword v0, v[1:2]
+; VI-NEXT: s_mov_b64 s[36:37], 0
+; VI-NEXT: v_mov_b32_e32 v3, s6
+; VI-NEXT: .LBB151_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v5
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5
+; VI-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; VI-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc
+; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; VI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; VI-NEXT: s_cbranch_execnz .LBB151_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[36:37]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_atomic_dec v0, v0, v1, s[4:5] offset:16 glc
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16
+; GFX9-NEXT: s_mov_b64 s[36:37], 0
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: .LBB151_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v4
+; GFX9-NEXT: v_add_u32_e32 v0, -1, v4
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc
+; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[3:4], s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4
+; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; GFX9-NEXT: s_cbranch_execnz .LBB151_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[36:37]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
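The scalar udec_wrap checks above now expect a compare-and-swap loop in place of the single buffer/flat/global atomic_dec instruction. As a rough sketch of what those SI/VI/GFX9 loops compute (illustrative IR only, hypothetical function name, not part of the patch):

define i32 @udec_wrap_cas_sketch(ptr addrspace(1) %gep, i32 %in) {
entry:
  ; Plain load of the current value before entering the CAS loop.
  %start = load i32, ptr addrspace(1) %gep
  br label %atomicrmw.start
atomicrmw.start:
  %old = phi i32 [ %start, %entry ], [ %loaded, %atomicrmw.start ]
  %dec = sub i32 %old, 1
  ; Wrap back to %in when the old value is zero or already greater than %in;
  ; this is the v_cmp_eq_u32 / v_cmp_lt_u32 / v_cndmask sequence in the checks above.
  %iszero = icmp eq i32 %old, 0
  %toobig = icmp ugt i32 %old, %in
  %wrap = or i1 %iszero, %toobig
  %new = select i1 %wrap, i32 %in, i32 %dec
  ; Retry until the exchange succeeds (the s_andn2_b64 / s_cbranch_execnz backedge).
  %pair = cmpxchg ptr addrspace(1) %gep, i32 %old, i32 %new seq_cst seq_cst
  %loaded = extractvalue { i32, i1 } %pair, 0
  %success = extractvalue { i32, i1 } %pair, 1
  br i1 %success, label %atomicrmw.end, label %atomicrmw.start
atomicrmw.end:
  ret i32 %loaded
}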
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index 778fc2e..483880b 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -623,7 +623,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in)
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -695,7 +695,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -771,7 +771,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
- %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -853,7 +853,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
- %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -912,7 +912,7 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -983,7 +983,7 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -1056,7 +1056,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
- %tmp0 = atomicrmw volatile and ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -1135,7 +1135,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
- %tmp0 = atomicrmw volatile and ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile and ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -1191,7 +1191,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in)
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -1263,7 +1263,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -1339,7 +1339,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
- %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -1421,7 +1421,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
- %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -1480,7 +1480,7 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -1551,7 +1551,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -1624,7 +1624,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
- %tmp0 = atomicrmw volatile sub ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile sub ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -1703,7 +1703,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
- %tmp0 = atomicrmw volatile sub ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile sub ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -1753,7 +1753,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in)
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -1822,7 +1822,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -1892,7 +1892,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
- %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -1971,7 +1971,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
- %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -2024,7 +2024,7 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -2092,7 +2092,7 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -2159,7 +2159,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
- %tmp0 = atomicrmw volatile max ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile max ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -2235,7 +2235,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
- %tmp0 = atomicrmw volatile max ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile max ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -2285,7 +2285,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -2354,7 +2354,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -2424,7 +2424,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
- %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -2503,7 +2503,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
- %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -2556,7 +2556,7 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -2624,7 +2624,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -2691,7 +2691,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
- %tmp0 = atomicrmw volatile umax ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umax ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -2767,7 +2767,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
- %tmp0 = atomicrmw volatile umax ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umax ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -2817,7 +2817,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in)
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -2886,7 +2886,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -2956,7 +2956,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
- %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -3035,7 +3035,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
- %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -3088,7 +3088,7 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -3156,7 +3156,7 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -3223,7 +3223,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
- %tmp0 = atomicrmw volatile min ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile min ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -3299,7 +3299,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
- %tmp0 = atomicrmw volatile min ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile min ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -3349,7 +3349,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -3418,7 +3418,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -3488,7 +3488,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out,
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
- %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -3567,7 +3567,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
- %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -3620,7 +3620,7 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -3688,7 +3688,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -3755,7 +3755,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
- %tmp0 = atomicrmw volatile umin ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umin ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -3831,7 +3831,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
- %tmp0 = atomicrmw volatile umin ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
+ %tmp0 = atomicrmw volatile umin ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -3887,7 +3887,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in)
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -3959,7 +3959,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -4035,7 +4035,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
- %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -4117,7 +4117,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
- %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -4176,7 +4176,7 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -4247,7 +4247,7 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -4320,7 +4320,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
- %tmp0 = atomicrmw volatile or ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile or ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -4399,7 +4399,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
- %tmp0 = atomicrmw volatile or ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile or ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -4455,7 +4455,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -4510,7 +4510,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr double, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, double %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, double %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -4565,7 +4565,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr ptr, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, ptr %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, ptr %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -4637,7 +4637,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -4713,7 +4713,7 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out,
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
- %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -4795,7 +4795,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
- %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -4854,7 +4854,7 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -4925,7 +4925,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -4998,7 +4998,7 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
- %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -5077,7 +5077,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
- %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -5133,7 +5133,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in)
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -5205,7 +5205,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -5281,7 +5281,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
- %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -5363,7 +5363,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
- %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -5422,7 +5422,7 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -5493,7 +5493,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX12-NEXT: s_endpgm
entry:
- %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -5566,7 +5566,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
- %tmp0 = atomicrmw volatile xor ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xor ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -5645,7 +5645,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
- %tmp0 = atomicrmw volatile xor ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile xor ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -7146,7 +7146,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in)
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -7218,7 +7218,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -7294,7 +7294,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
- %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -7349,7 +7349,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in)
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -7421,7 +7421,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
- %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
store i64 %tmp0, ptr addrspace(1) %out2
ret void
}
@@ -7497,6 +7497,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
- %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
+ %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
+
+!0 = !{}
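Every !amdgpu.no.remote.memory !0 operand added in this file points at the empty metadata node defined in this final hunk; the annotation asserts that the atomic never touches remote memory (memory only reachable across a bus such as PCIe). A minimal standalone form of the pattern (hypothetical kernel name, shown only for illustration):

define amdgpu_kernel void @atomic_and_i64_no_remote(ptr addrspace(1) %out, i64 %in) {
entry:
  ; Same shape as the tests above: an agent-scope seq_cst atomicrmw tagged with the
  ; no.remote.memory metadata node declared at the end of the module.
  %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
  ret void
}

!0 = !{}

Judging by the otherwise untouched check lines in these hunks, the annotation appears to be what keeps these tests on their native atomic instructions.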
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index a7f1644..74f0f64 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -1329,26 +1329,76 @@ define void @global_atomic_sub_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB30_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_sub_i32_e32 v4, vcc, v6, v2
+; SI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v7
+; SI-NEXT: v_mov_b32_e32 v10, v6
+; SI-NEXT: v_mov_b32_e32 v9, v5
+; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB30_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_sub_i64_noret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB30_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_sub_u32_e32 v4, vcc, v6, v2
+; VI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB30_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_sub_i64_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[2:3], off
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB30_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst
ret void
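The i64 sub expansion above follows the same compare-and-swap shape, but over a register pair: a dwordx2 load feeds a subtract-with-borrow (the v_sub_*/v_subb_* pairs), and cmpswap_x2 compares the full 64-bit value. A hedged IR-level sketch (hypothetical function name, not from the patch):

define void @sub_i64_cas_sketch(ptr addrspace(1) %ptr, i64 %in) {
entry:
  %start = load i64, ptr addrspace(1) %ptr
  br label %atomicrmw.start
atomicrmw.start:
  %old = phi i64 [ %start, %entry ], [ %loaded, %atomicrmw.start ]
  ; One i64 subtract; codegen splits it into a low-word sub plus a borrow into the high word.
  %new = sub i64 %old, %in
  ; 64-bit compare-and-swap; on failure the memory changed underneath, so retry.
  %pair = cmpxchg ptr addrspace(1) %ptr, i64 %old, i64 %new seq_cst seq_cst
  %loaded = extractvalue { i64, i1 } %pair, 0
  %success = extractvalue { i64, i1 } %pair, 1
  br i1 %success, label %atomicrmw.end, label %atomicrmw.start
atomicrmw.end:
  ret void
}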
@@ -1362,9 +1412,29 @@ define void @global_atomic_sub_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB31_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_sub_i32_e32 v4, vcc, v6, v2
+; SI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v7
+; SI-NEXT: v_mov_b32_e32 v10, v6
+; SI-NEXT: v_mov_b32_e32 v9, v5
+; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB31_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1373,17 +1443,47 @@ define void @global_atomic_sub_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB31_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_sub_u32_e32 v4, vcc, v6, v2
+; VI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB31_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_sub_i64_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[2:3], off offset:32
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB31_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst
@@ -1394,32 +1494,88 @@ define i64 @global_atomic_sub_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-LABEL: global_atomic_sub_i64_ret:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v3
+; SI-NEXT: v_mov_b32_e32 v7, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB32_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
+; SI-NEXT: v_sub_i32_e32 v8, vcc, v10, v7
+; SI-NEXT: v_subb_u32_e32 v9, vcc, v11, v6, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, v8
+; SI-NEXT: v_mov_b32_e32 v1, v9
+; SI-NEXT: v_mov_b32_e32 v2, v10
+; SI-NEXT: v_mov_b32_e32 v3, v11
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB32_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_sub_i64_ret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB32_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_sub_u32_e32 v4, vcc, v6, v2
+; VI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB32_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, v4
+; VI-NEXT: v_mov_b32_e32 v1, v5
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_sub_i64_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB32_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst
ret i64 %result
@@ -1429,34 +1585,88 @@ define i64 @global_atomic_sub_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-LABEL: global_atomic_sub_i64_ret_offset:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v3
+; SI-NEXT: v_mov_b32_e32 v7, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB33_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
+; SI-NEXT: v_sub_i32_e32 v8, vcc, v10, v7
+; SI-NEXT: v_subb_u32_e32 v9, vcc, v11, v6, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, v8
+; SI-NEXT: v_mov_b32_e32 v1, v9
+; SI-NEXT: v_mov_b32_e32 v2, v10
+; SI-NEXT: v_mov_b32_e32 v3, v11
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB33_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_sub_i64_ret_offset:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB33_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_sub_u32_e32 v6, vcc, v8, v2
+; VI-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB33_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_sub_i64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB33_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst
@@ -1468,25 +1678,43 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: s_mov_b32 s34, s7
-; SI-NEXT: s_mov_b32 s35, s6
+; SI-NEXT: v_writelane_b32 v9, s6, 0
+; SI-NEXT: v_writelane_b32 v9, s7, 1
+; SI-NEXT: s_mov_b32 s35, s7
+; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: .LBB34_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0
+; SI-NEXT: v_subrev_i32_e32 v0, vcc, s34, v2
+; SI-NEXT: v_subb_u32_e32 v1, vcc, v3, v4, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v8, v3
+; SI-NEXT: v_mov_b32_e32 v7, v2
+; SI-NEXT: v_mov_b32_e32 v6, v1
+; SI-NEXT: v_mov_b32_e32 v5, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[2:3]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v2, v5
+; SI-NEXT: v_mov_b32_e32 v3, v6
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB34_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v9, 1
+; SI-NEXT: v_readlane_b32 s6, v9, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1494,24 +1722,54 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre
; VI-LABEL: global_atomic_sub_i64_noret_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: .LBB34_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2
+; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, v1
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB34_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_sub_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB34_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst
ret void
@@ -1522,23 +1780,43 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace(
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_writelane_b32 v9, s6, 0
+; SI-NEXT: v_writelane_b32 v9, s7, 1
+; SI-NEXT: s_mov_b32 s35, s7
+; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: .LBB35_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: v_subrev_i32_e32 v0, vcc, s34, v2
+; SI-NEXT: v_subb_u32_e32 v1, vcc, v3, v4, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v8, v3
+; SI-NEXT: v_mov_b32_e32 v7, v2
+; SI-NEXT: v_mov_b32_e32 v6, v1
+; SI-NEXT: v_mov_b32_e32 v5, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[2:3]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v2, v5
+; SI-NEXT: v_mov_b32_e32 v3, v6
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB35_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v9, 1
+; SI-NEXT: v_readlane_b32 s6, v9, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1548,24 +1826,52 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace(
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s34
+; VI-NEXT: v_mov_b32_e32 v5, s35
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: .LBB35_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2
+; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, v1
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB35_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_sub_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB35_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst
@@ -1577,25 +1883,43 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg %
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: s_mov_b32 s34, s7
-; SI-NEXT: s_mov_b32 s35, s6
+; SI-NEXT: v_writelane_b32 v9, s6, 0
+; SI-NEXT: v_writelane_b32 v9, s7, 1
+; SI-NEXT: s_mov_b32 s35, s7
+; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: .LBB36_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v8, v1
+; SI-NEXT: v_mov_b32_e32 v7, v0
+; SI-NEXT: v_subrev_i32_e32 v5, vcc, s34, v7
+; SI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, v5
+; SI-NEXT: v_mov_b32_e32 v1, v6
+; SI-NEXT: v_mov_b32_e32 v2, v7
+; SI-NEXT: v_mov_b32_e32 v3, v8
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB36_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v9, 1
+; SI-NEXT: v_readlane_b32 s6, v9, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1603,24 +1927,54 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg %
; VI-LABEL: global_atomic_sub_i64_ret_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v4, s7
; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: .LBB36_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v8, v1
+; VI-NEXT: v_mov_b32_e32 v7, v0
+; VI-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7
+; VI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB36_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_sub_i64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v6
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[4:7], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB36_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst
ret i64 %result
@@ -1631,23 +1985,43 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1)
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_writelane_b32 v9, s6, 0
+; SI-NEXT: v_writelane_b32 v9, s7, 1
+; SI-NEXT: s_mov_b32 s35, s7
+; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: .LBB37_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v8, v1
+; SI-NEXT: v_mov_b32_e32 v7, v0
+; SI-NEXT: v_subrev_i32_e32 v5, vcc, s34, v7
+; SI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, v5
+; SI-NEXT: v_mov_b32_e32 v1, v6
+; SI-NEXT: v_mov_b32_e32 v2, v7
+; SI-NEXT: v_mov_b32_e32 v3, v8
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB37_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v9, 1
+; SI-NEXT: v_readlane_b32 s6, v9, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1658,23 +2032,51 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: .LBB37_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v8, v1
+; VI-NEXT: v_mov_b32_e32 v7, v0
+; VI-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7
+; VI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB37_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_sub_i64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v6
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[4:7], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB37_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst
@@ -1767,26 +2169,76 @@ define void @global_atomic_and_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB40_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v5, v7, v3
+; SI-NEXT: v_and_b32_e32 v4, v6, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v7
+; SI-NEXT: v_mov_b32_e32 v10, v6
+; SI-NEXT: v_mov_b32_e32 v9, v5
+; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB40_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_and_i64_noret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB40_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_and_b32_e32 v5, v7, v3
+; VI-NEXT: v_and_b32_e32 v4, v6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB40_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_and_i64_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[2:3], off
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_and_b32_e32 v4, v6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB40_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst
ret void
@@ -1800,9 +2252,29 @@ define void @global_atomic_and_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB41_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v5, v7, v3
+; SI-NEXT: v_and_b32_e32 v4, v6, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v7
+; SI-NEXT: v_mov_b32_e32 v10, v6
+; SI-NEXT: v_mov_b32_e32 v9, v5
+; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB41_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1811,17 +2283,47 @@ define void @global_atomic_and_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB41_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_and_b32_e32 v5, v7, v3
+; VI-NEXT: v_and_b32_e32 v4, v6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB41_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_and_i64_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[2:3], off offset:32
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_and_b32_e32 v4, v6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB41_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst
@@ -1832,32 +2334,88 @@ define i64 @global_atomic_and_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-LABEL: global_atomic_and_i64_ret:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v3
+; SI-NEXT: v_mov_b32_e32 v7, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB42_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
+; SI-NEXT: v_and_b32_e32 v9, v11, v6
+; SI-NEXT: v_and_b32_e32 v8, v10, v7
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, v8
+; SI-NEXT: v_mov_b32_e32 v1, v9
+; SI-NEXT: v_mov_b32_e32 v2, v10
+; SI-NEXT: v_mov_b32_e32 v3, v11
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB42_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_and_i64_ret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB42_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_and_b32_e32 v5, v7, v3
+; VI-NEXT: v_and_b32_e32 v4, v6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB42_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, v4
+; VI-NEXT: v_mov_b32_e32 v1, v5
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_and_i64_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_and_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_and_b32_e32 v4, v6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB42_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst
ret i64 %result
@@ -1867,34 +2425,88 @@ define i64 @global_atomic_and_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-LABEL: global_atomic_and_i64_ret_offset:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v3
+; SI-NEXT: v_mov_b32_e32 v7, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB43_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
+; SI-NEXT: v_and_b32_e32 v9, v11, v6
+; SI-NEXT: v_and_b32_e32 v8, v10, v7
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, v8
+; SI-NEXT: v_mov_b32_e32 v1, v9
+; SI-NEXT: v_mov_b32_e32 v2, v10
+; SI-NEXT: v_mov_b32_e32 v3, v11
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB43_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_and_i64_ret_offset:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB43_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_and_b32_e32 v7, v9, v3
+; VI-NEXT: v_and_b32_e32 v6, v8, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB43_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_and_i64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_and_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_and_b32_e32 v4, v6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB43_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst
@@ -1906,25 +2518,42 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inre
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
+; SI-NEXT: v_writelane_b32 v8, s6, 0
+; SI-NEXT: v_writelane_b32 v8, s7, 1
; SI-NEXT: s_mov_b32 s34, s7
; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB44_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0
+; SI-NEXT: v_and_b32_e32 v1, s34, v3
+; SI-NEXT: v_and_b32_e32 v0, s35, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v7, v3
+; SI-NEXT: v_mov_b32_e32 v6, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB44_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v8, 1
+; SI-NEXT: v_readlane_b32 s6, v8, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1932,24 +2561,52 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inre
; VI-LABEL: global_atomic_and_i64_noret_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: .LBB44_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_and_b32_e32 v1, s7, v3
+; VI-NEXT: v_and_b32_e32 v0, s6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, v1
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB44_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_and_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, s7, v3
+; GFX9-NEXT: v_and_b32_e32 v0, s6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB44_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst
ret void
@@ -1960,23 +2617,42 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace(
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_writelane_b32 v8, s6, 0
+; SI-NEXT: v_writelane_b32 v8, s7, 1
+; SI-NEXT: s_mov_b32 s34, s7
+; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB45_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: v_and_b32_e32 v1, s34, v3
+; SI-NEXT: v_and_b32_e32 v0, s35, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v7, v3
+; SI-NEXT: v_mov_b32_e32 v6, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB45_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v8, 1
+; SI-NEXT: v_readlane_b32 s6, v8, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1986,24 +2662,50 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace(
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s34
+; VI-NEXT: v_mov_b32_e32 v5, s35
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB45_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_and_b32_e32 v1, s7, v3
+; VI-NEXT: v_and_b32_e32 v0, s6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, v1
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB45_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_and_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, s7, v3
+; GFX9-NEXT: v_and_b32_e32 v0, s6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB45_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst
@@ -2015,25 +2717,42 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg %
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
+; SI-NEXT: v_writelane_b32 v6, s6, 0
+; SI-NEXT: v_writelane_b32 v6, s7, 1
; SI-NEXT: s_mov_b32 s34, s7
; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB46_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, s34, v5
+; SI-NEXT: v_and_b32_e32 v2, s35, v4
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB46_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v6, 1
+; SI-NEXT: v_readlane_b32 s6, v6, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -2041,24 +2760,52 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg %
; VI-LABEL: global_atomic_and_i64_ret_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: .LBB46_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, v1
+; VI-NEXT: v_mov_b32_e32 v6, v0
+; VI-NEXT: v_and_b32_e32 v5, s7, v7
+; VI-NEXT: v_and_b32_e32 v4, s6, v6
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB46_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_and_i64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-NEXT: v_and_b32_e32 v4, s7, v6
+; GFX9-NEXT: v_and_b32_e32 v3, s6, v5
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB46_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst
ret i64 %result
@@ -2069,23 +2816,42 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1)
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_writelane_b32 v6, s6, 0
+; SI-NEXT: v_writelane_b32 v6, s7, 1
+; SI-NEXT: s_mov_b32 s34, s7
+; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB47_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_and_b32_e32 v3, s34, v5
+; SI-NEXT: v_and_b32_e32 v2, s35, v4
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB47_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v6, 1
+; SI-NEXT: v_readlane_b32 s6, v6, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -2096,23 +2862,49 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB47_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, v1
+; VI-NEXT: v_mov_b32_e32 v6, v0
+; VI-NEXT: v_and_b32_e32 v5, s7, v7
+; VI-NEXT: v_and_b32_e32 v4, s6, v6
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB47_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_and_i64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-NEXT: v_and_b32_e32 v4, s7, v6
+; GFX9-NEXT: v_and_b32_e32 v3, s6, v5
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB47_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst
@@ -3197,26 +3989,76 @@ define void @global_atomic_or_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB60_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v5, v7, v3
+; SI-NEXT: v_or_b32_e32 v4, v6, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v7
+; SI-NEXT: v_mov_b32_e32 v10, v6
+; SI-NEXT: v_mov_b32_e32 v9, v5
+; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB60_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_or_i64_noret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB60_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_e32 v5, v7, v3
+; VI-NEXT: v_or_b32_e32 v4, v6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB60_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_or_i64_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[2:3], off
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_or_b32_e32 v4, v6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB60_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst
ret void
@@ -3230,9 +4072,29 @@ define void @global_atomic_or_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB61_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v5, v7, v3
+; SI-NEXT: v_or_b32_e32 v4, v6, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v7
+; SI-NEXT: v_mov_b32_e32 v10, v6
+; SI-NEXT: v_mov_b32_e32 v9, v5
+; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB61_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -3241,17 +4103,47 @@ define void @global_atomic_or_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB61_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_e32 v5, v7, v3
+; VI-NEXT: v_or_b32_e32 v4, v6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB61_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_or_i64_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[2:3], off offset:32
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_or_b32_e32 v4, v6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB61_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst
@@ -3262,32 +4154,88 @@ define i64 @global_atomic_or_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-LABEL: global_atomic_or_i64_ret:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v3
+; SI-NEXT: v_mov_b32_e32 v7, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB62_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
+; SI-NEXT: v_or_b32_e32 v9, v11, v6
+; SI-NEXT: v_or_b32_e32 v8, v10, v7
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, v8
+; SI-NEXT: v_mov_b32_e32 v1, v9
+; SI-NEXT: v_mov_b32_e32 v2, v10
+; SI-NEXT: v_mov_b32_e32 v3, v11
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB62_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_or_i64_ret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB62_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_or_b32_e32 v5, v7, v3
+; VI-NEXT: v_or_b32_e32 v4, v6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB62_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, v4
+; VI-NEXT: v_mov_b32_e32 v1, v5
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_or_i64_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_or_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_or_b32_e32 v4, v6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB62_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst
ret i64 %result
@@ -3297,34 +4245,88 @@ define i64 @global_atomic_or_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-LABEL: global_atomic_or_i64_ret_offset:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v3
+; SI-NEXT: v_mov_b32_e32 v7, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB63_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
+; SI-NEXT: v_or_b32_e32 v9, v11, v6
+; SI-NEXT: v_or_b32_e32 v8, v10, v7
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, v8
+; SI-NEXT: v_mov_b32_e32 v1, v9
+; SI-NEXT: v_mov_b32_e32 v2, v10
+; SI-NEXT: v_mov_b32_e32 v3, v11
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB63_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_or_i64_ret_offset:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB63_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_or_b32_e32 v7, v9, v3
+; VI-NEXT: v_or_b32_e32 v6, v8, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB63_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_or_i64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_or_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_or_b32_e32 v4, v6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB63_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst
@@ -3336,25 +4338,42 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
+; SI-NEXT: v_writelane_b32 v8, s6, 0
+; SI-NEXT: v_writelane_b32 v8, s7, 1
; SI-NEXT: s_mov_b32 s34, s7
; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB64_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0
+; SI-NEXT: v_or_b32_e32 v1, s34, v3
+; SI-NEXT: v_or_b32_e32 v0, s35, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v7, v3
+; SI-NEXT: v_mov_b32_e32 v6, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB64_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v8, 1
+; SI-NEXT: v_readlane_b32 s6, v8, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3362,24 +4381,52 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg
; VI-LABEL: global_atomic_or_i64_noret_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: .LBB64_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_e32 v1, s7, v3
+; VI-NEXT: v_or_b32_e32 v0, s6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, v1
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB64_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_or_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB64_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_e32 v1, s7, v3
+; GFX9-NEXT: v_or_b32_e32 v0, s6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB64_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst
ret void
@@ -3390,23 +4437,42 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_writelane_b32 v8, s6, 0
+; SI-NEXT: v_writelane_b32 v8, s7, 1
+; SI-NEXT: s_mov_b32 s34, s7
+; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB65_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: v_or_b32_e32 v1, s34, v3
+; SI-NEXT: v_or_b32_e32 v0, s35, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v7, v3
+; SI-NEXT: v_mov_b32_e32 v6, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB65_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v8, 1
+; SI-NEXT: v_readlane_b32 s6, v8, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3416,24 +4482,50 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s34
+; VI-NEXT: v_mov_b32_e32 v5, s35
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB65_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_e32 v1, s7, v3
+; VI-NEXT: v_or_b32_e32 v0, s6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, v1
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB65_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_or_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_e32 v1, s7, v3
+; GFX9-NEXT: v_or_b32_e32 v0, s6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB65_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst
@@ -3445,25 +4537,42 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %p
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
+; SI-NEXT: v_writelane_b32 v6, s6, 0
+; SI-NEXT: v_writelane_b32 v6, s7, 1
; SI-NEXT: s_mov_b32 s34, s7
; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB66_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_or_b32_e32 v3, s34, v5
+; SI-NEXT: v_or_b32_e32 v2, s35, v4
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB66_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v6, 1
+; SI-NEXT: v_readlane_b32 s6, v6, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3471,24 +4580,52 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %p
; VI-LABEL: global_atomic_or_i64_ret_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: .LBB66_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, v1
+; VI-NEXT: v_mov_b32_e32 v6, v0
+; VI-NEXT: v_or_b32_e32 v5, s7, v7
+; VI-NEXT: v_or_b32_e32 v4, s6, v6
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB66_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_or_i64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-NEXT: v_or_b32_e32 v4, s7, v6
+; GFX9-NEXT: v_or_b32_e32 v3, s6, v5
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB66_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst
ret i64 %result
@@ -3499,23 +4636,42 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) i
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_writelane_b32 v6, s6, 0
+; SI-NEXT: v_writelane_b32 v6, s7, 1
+; SI-NEXT: s_mov_b32 s34, s7
+; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB67_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_or_b32_e32 v3, s34, v5
+; SI-NEXT: v_or_b32_e32 v2, s35, v4
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB67_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v6, 1
+; SI-NEXT: v_readlane_b32 s6, v6, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3526,23 +4682,49 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) i
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB67_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, v1
+; VI-NEXT: v_mov_b32_e32 v6, v0
+; VI-NEXT: v_or_b32_e32 v5, s7, v7
+; VI-NEXT: v_or_b32_e32 v4, s6, v6
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB67_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_or_i64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-NEXT: v_or_b32_e32 v4, s7, v6
+; GFX9-NEXT: v_or_b32_e32 v3, s6, v5
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB67_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst
@@ -3635,26 +4817,76 @@ define void @global_atomic_xor_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB70_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_xor_b32_e32 v5, v7, v3
+; SI-NEXT: v_xor_b32_e32 v4, v6, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v7
+; SI-NEXT: v_mov_b32_e32 v10, v6
+; SI-NEXT: v_mov_b32_e32 v9, v5
+; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB70_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xor_i64_noret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB70_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v5, v7, v3
+; VI-NEXT: v_xor_b32_e32 v4, v6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB70_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xor_i64_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB70_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB70_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst
ret void
@@ -3668,9 +4900,29 @@ define void @global_atomic_xor_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB71_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_xor_b32_e32 v5, v7, v3
+; SI-NEXT: v_xor_b32_e32 v4, v6, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v7
+; SI-NEXT: v_mov_b32_e32 v10, v6
+; SI-NEXT: v_mov_b32_e32 v9, v5
+; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB71_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -3679,17 +4931,47 @@ define void @global_atomic_xor_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB71_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v5, v7, v3
+; VI-NEXT: v_xor_b32_e32 v4, v6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB71_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xor_i64_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off offset:32
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB71_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB71_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst
@@ -3700,32 +4982,88 @@ define i64 @global_atomic_xor_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-LABEL: global_atomic_xor_i64_ret:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v3
+; SI-NEXT: v_mov_b32_e32 v7, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB72_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
+; SI-NEXT: v_xor_b32_e32 v9, v11, v6
+; SI-NEXT: v_xor_b32_e32 v8, v10, v7
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, v8
+; SI-NEXT: v_mov_b32_e32 v1, v9
+; SI-NEXT: v_mov_b32_e32 v2, v10
+; SI-NEXT: v_mov_b32_e32 v3, v11
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB72_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xor_i64_ret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB72_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_xor_b32_e32 v5, v7, v3
+; VI-NEXT: v_xor_b32_e32 v4, v6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB72_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, v4
+; VI-NEXT: v_mov_b32_e32 v1, v5
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xor_i64_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB72_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB72_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst
ret i64 %result
@@ -3735,34 +5073,88 @@ define i64 @global_atomic_xor_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-LABEL: global_atomic_xor_i64_ret_offset:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v6, v3
+; SI-NEXT: v_mov_b32_e32 v7, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB73_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
+; SI-NEXT: v_xor_b32_e32 v9, v11, v6
+; SI-NEXT: v_xor_b32_e32 v8, v10, v7
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, v8
+; SI-NEXT: v_mov_b32_e32 v1, v9
+; SI-NEXT: v_mov_b32_e32 v2, v10
+; SI-NEXT: v_mov_b32_e32 v3, v11
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB73_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_xor_i64_ret_offset:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB73_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_xor_b32_e32 v7, v9, v3
+; VI-NEXT: v_xor_b32_e32 v6, v8, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB73_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xor_i64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB73_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3
+; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB73_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst
@@ -3774,25 +5166,42 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inre
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
+; SI-NEXT: v_writelane_b32 v8, s6, 0
+; SI-NEXT: v_writelane_b32 v8, s7, 1
; SI-NEXT: s_mov_b32 s34, s7
; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB74_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0
+; SI-NEXT: v_xor_b32_e32 v1, s34, v3
+; SI-NEXT: v_xor_b32_e32 v0, s35, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v7, v3
+; SI-NEXT: v_mov_b32_e32 v6, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB74_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v8, 1
+; SI-NEXT: v_readlane_b32 s6, v8, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3800,24 +5209,52 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inre
; VI-LABEL: global_atomic_xor_i64_noret_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: .LBB74_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v1, s7, v3
+; VI-NEXT: v_xor_b32_e32 v0, s6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, v1
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB74_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xor_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB74_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3
+; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB74_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst
ret void
@@ -3828,23 +5265,42 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace(
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_writelane_b32 v8, s6, 0
+; SI-NEXT: v_writelane_b32 v8, s7, 1
+; SI-NEXT: s_mov_b32 s34, s7
+; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB75_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: v_xor_b32_e32 v1, s34, v3
+; SI-NEXT: v_xor_b32_e32 v0, s35, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v7, v3
+; SI-NEXT: v_mov_b32_e32 v6, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB75_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v8, 1
+; SI-NEXT: v_readlane_b32 s6, v8, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3854,24 +5310,50 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace(
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s34
+; VI-NEXT: v_mov_b32_e32 v5, s35
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB75_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v1, s7, v3
+; VI-NEXT: v_xor_b32_e32 v0, s6, v2
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, v1
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB75_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xor_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3
+; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB75_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst
@@ -3883,25 +5365,42 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg %
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
+; SI-NEXT: v_writelane_b32 v6, s6, 0
+; SI-NEXT: v_writelane_b32 v6, s7, 1
; SI-NEXT: s_mov_b32 s34, s7
; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB76_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_xor_b32_e32 v3, s34, v5
+; SI-NEXT: v_xor_b32_e32 v2, s35, v4
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB76_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v6, 1
+; SI-NEXT: v_readlane_b32 s6, v6, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3909,24 +5408,52 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg %
; VI-LABEL: global_atomic_xor_i64_ret_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: .LBB76_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, v1
+; VI-NEXT: v_mov_b32_e32 v6, v0
+; VI-NEXT: v_xor_b32_e32 v5, s7, v7
+; VI-NEXT: v_xor_b32_e32 v4, s6, v6
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB76_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xor_i64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-NEXT: v_xor_b32_e32 v4, s7, v6
+; GFX9-NEXT: v_xor_b32_e32 v3, s6, v5
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB76_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst
ret i64 %result
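(The retry loops in the checks above all follow one shape: load the current 64-bit value, compute the updated value, and repeat a cmpxchg until the stored value matches the one the update was computed from. Purely as orientation, a minimal LLVM IR sketch of that expansion for the xor case — function and value names are illustrative, not taken from this test — is:

define i64 @xor_expanded_sketch(ptr addrspace(1) %ptr, i64 %in) {
entry:
  ; initial load of the current memory value
  %start = load i64, ptr addrspace(1) %ptr, align 8
  br label %atomicrmw.start

atomicrmw.start:
  %old = phi i64 [ %start, %entry ], [ %loaded, %atomicrmw.start ]
  ; compute the new value for this retry
  %new = xor i64 %old, %in
  ; attempt the exchange; yields { loaded value, success flag }
  %pair = cmpxchg ptr addrspace(1) %ptr, i64 %old, i64 %new seq_cst seq_cst
  %loaded = extractvalue { i64, i1 } %pair, 0
  %success = extractvalue { i64, i1 } %pair, 1
  br i1 %success, label %atomicrmw.end, label %atomicrmw.start

atomicrmw.end:
  ret i64 %loaded
}

In the generated code, the s_or_b64/s_andn2_b64 exec-mask bookkeeping plays the role of the success branch per active lane.)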
@@ -3937,23 +5464,42 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1)
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_writelane_b32 v6, s6, 0
+; SI-NEXT: v_writelane_b32 v6, s7, 1
+; SI-NEXT: s_mov_b32 s34, s7
+; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB77_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_xor_b32_e32 v3, s34, v5
+; SI-NEXT: v_xor_b32_e32 v2, s35, v4
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB77_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v6, 1
+; SI-NEXT: v_readlane_b32 s6, v6, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3964,23 +5510,49 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB77_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, v1
+; VI-NEXT: v_mov_b32_e32 v6, v0
+; VI-NEXT: v_xor_b32_e32 v5, s7, v7
+; VI-NEXT: v_xor_b32_e32 v4, s6, v6
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB77_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_xor_i64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-NEXT: v_xor_b32_e32 v4, s7, v6
+; GFX9-NEXT: v_xor_b32_e32 v3, s6, v5
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB77_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst
@@ -5347,30 +6919,9 @@ define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr add
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB92_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v7
-; SI-NEXT: v_mov_b32_e32 v10, v6
-; SI-NEXT: v_mov_b32_e32 v9, v5
-; SI-NEXT: v_mov_b32_e32 v8, v4
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: buffer_atomic_smax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: v_mov_b32_e32 v7, v9
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB92_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -5379,49 +6930,17 @@ define void @global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr add
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB92_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
-; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; VI-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; VI-NEXT: v_mov_b32_e32 v7, v5
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v6, v4
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB92_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
+; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v[2:3], off offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX9-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB92_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%tmp0 = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
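(As the SI/VI/GFX9 checks in this block show, the !amdgpu.no.remote.memory annotation lets the backend keep the single hardware atomic — here *_atomic_smax_x2 — rather than the cmpxchg retry loop; roughly, the metadata asserts the access will not touch remote/peer memory. A minimal, illustrative use of the annotation, with placeholder names not taken from this file:

define void @smax_no_remote_sketch(ptr addrspace(1) %p, i64 %v) {
  ; metadata marks the access as not reaching remote memory
  %old = atomicrmw max ptr addrspace(1) %p, i64 %v seq_cst, !amdgpu.no.remote.memory !0
  ret void
}
!0 = !{}
)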
@@ -5432,91 +6951,34 @@ define i64 @global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr addrsp
; SI-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: v_mov_b32_e32 v4, v2
-; SI-NEXT: v_mov_b32_e32 v7, v1
-; SI-NEXT: v_mov_b32_e32 v6, v0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB93_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
-; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[10:11], v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc
-; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, v8
-; SI-NEXT: v_mov_b32_e32 v1, v9
-; SI-NEXT: v_mov_b32_e32 v2, v10
-; SI-NEXT: v_mov_b32_e32 v3, v11
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: buffer_atomic_smax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB93_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB93_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v9, v1
-; VI-NEXT: v_mov_b32_e32 v8, v0
-; VI-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
-; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB93_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
+; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB93_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%result = atomicrmw max ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -6697,30 +8159,9 @@ define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr ad
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB105_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v7
-; SI-NEXT: v_mov_b32_e32 v10, v6
-; SI-NEXT: v_mov_b32_e32 v9, v5
-; SI-NEXT: v_mov_b32_e32 v8, v4
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: buffer_atomic_umax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: v_mov_b32_e32 v7, v9
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB105_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -6729,49 +8170,17 @@ define void @global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr ad
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB105_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
-; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; VI-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; VI-NEXT: v_mov_b32_e32 v7, v5
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v6, v4
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB105_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
+; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v[2:3], off offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX9-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB105_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%tmp0 = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -6782,91 +8191,34 @@ define i64 @global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs
; SI-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: v_mov_b32_e32 v4, v2
-; SI-NEXT: v_mov_b32_e32 v7, v1
-; SI-NEXT: v_mov_b32_e32 v6, v0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB106_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
-; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[10:11], v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc
-; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, v8
-; SI-NEXT: v_mov_b32_e32 v1, v9
-; SI-NEXT: v_mov_b32_e32 v2, v10
-; SI-NEXT: v_mov_b32_e32 v3, v11
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: buffer_atomic_umax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB106_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB106_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v9, v1
-; VI-NEXT: v_mov_b32_e32 v8, v0
-; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
-; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB106_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
+; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB106_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%result = atomicrmw umax ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -7683,30 +9035,9 @@ define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr ad
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB115_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v7
-; SI-NEXT: v_mov_b32_e32 v10, v6
-; SI-NEXT: v_mov_b32_e32 v9, v5
-; SI-NEXT: v_mov_b32_e32 v8, v4
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: buffer_atomic_umin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: v_mov_b32_e32 v7, v9
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB115_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -7715,49 +9046,17 @@ define void @global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr ad
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB115_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
-; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; VI-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; VI-NEXT: v_mov_b32_e32 v7, v5
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v6, v4
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB115_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB115_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
+; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v[2:3], off offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX9-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB115_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%tmp0 = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -7768,91 +9067,34 @@ define i64 @global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs
; SI-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: v_mov_b32_e32 v4, v2
-; SI-NEXT: v_mov_b32_e32 v7, v1
-; SI-NEXT: v_mov_b32_e32 v6, v0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB116_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
-; SI-NEXT: v_cmp_le_u64_e32 vcc, v[10:11], v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc
-; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, v8
-; SI-NEXT: v_mov_b32_e32 v1, v9
-; SI-NEXT: v_mov_b32_e32 v2, v10
-; SI-NEXT: v_mov_b32_e32 v3, v11
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: buffer_atomic_umin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB116_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB116_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v9, v1
-; VI-NEXT: v_mov_b32_e32 v8, v0
-; VI-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
-; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB116_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB116_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
+; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB116_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%result = atomicrmw umin ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -9134,30 +10376,9 @@ define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr add
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB129_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v7
-; SI-NEXT: v_mov_b32_e32 v10, v6
-; SI-NEXT: v_mov_b32_e32 v9, v5
-; SI-NEXT: v_mov_b32_e32 v8, v4
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: buffer_atomic_smin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: v_mov_b32_e32 v7, v9
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB129_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -9166,49 +10387,17 @@ define void @global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr add
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB129_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
-; VI-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; VI-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; VI-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; VI-NEXT: v_mov_b32_e32 v7, v5
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v6, v4
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB129_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
+; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v[2:3], off offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX9-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB129_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%tmp0 = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -9219,91 +10408,34 @@ define i64 @global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr addrsp
; SI-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v3
-; SI-NEXT: v_mov_b32_e32 v4, v2
-; SI-NEXT: v_mov_b32_e32 v7, v1
-; SI-NEXT: v_mov_b32_e32 v6, v0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
-; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB130_1: ; %atomicrmw.start
-; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
-; SI-NEXT: v_cmp_le_i64_e32 vcc, v[10:11], v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc
-; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, v8
-; SI-NEXT: v_mov_b32_e32 v1, v9
-; SI-NEXT: v_mov_b32_e32 v2, v10
-; SI-NEXT: v_mov_b32_e32 v3, v11
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: buffer_atomic_smin_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
-; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB130_1
-; SI-NEXT: ; %bb.2: ; %atomicrmw.end
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
-; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB130_1: ; %atomicrmw.start
-; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v9, v1
-; VI-NEXT: v_mov_b32_e32 v8, v0
-; VI-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
-; VI-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
-; VI-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
-; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB130_1
-; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start
-; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, v5
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
+; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB130_1
-; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%result = atomicrmw min ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0
@@ -9322,26 +10454,85 @@ define void @global_atomic_uinc_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB131_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v6
+; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; SI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v7
+; SI-NEXT: v_mov_b32_e32 v10, v6
+; SI-NEXT: v_mov_b32_e32 v9, v5
+; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB131_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_uinc_wrap_i64_noret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB131_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v6
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; VI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; VI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB131_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[2:3], off
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB131_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB131_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
ret void
@@ -9355,9 +10546,32 @@ define void @global_atomic_uinc_wrap_i64_noret_offset(ptr addrspace(1) %out, i64
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB132_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v6
+; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; SI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v7
+; SI-NEXT: v_mov_b32_e32 v10, v6
+; SI-NEXT: v_mov_b32_e32 v9, v5
+; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB132_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -9366,17 +10580,53 @@ define void @global_atomic_uinc_wrap_i64_noret_offset(ptr addrspace(1) %out, i64
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB132_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v6
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; VI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; VI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB132_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[2:3], off offset:32
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB132_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB132_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst
@@ -9387,32 +10637,97 @@ define i64 @global_atomic_uinc_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-LABEL: global_atomic_uinc_wrap_i64_ret:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: v_mov_b32_e32 v4, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB133_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v10
+; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc
+; SI-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v8
+; SI-NEXT: v_mov_b32_e32 v1, v9
+; SI-NEXT: v_mov_b32_e32 v2, v10
+; SI-NEXT: v_mov_b32_e32 v3, v11
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB133_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_uinc_wrap_i64_ret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB133_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v6
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; VI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; VI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB133_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, v4
+; VI-NEXT: v_mov_b32_e32 v1, v5
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB133_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB133_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
ret i64 %result
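(The uinc_wrap loops above compute the new value as old+1 while old is still below the operand and wrap to 0 otherwise, which is what the v_add*/v_cmp_lt_u64/v_cndmask sequence inside each retry implements. A small IR-level sketch of that per-iteration update, with illustrative names only:

; new = (old < in) ? old + 1 : 0  -- the update fed into the cmpxchg each iteration
define i64 @uinc_wrap_step_sketch(i64 %old, i64 %in) {
  %inc = add i64 %old, 1
  %below = icmp ult i64 %old, %in
  %new = select i1 %below, i64 %inc, i64 0
  ret i64 %new
}
)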
@@ -9422,34 +10737,97 @@ define i64 @global_atomic_uinc_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i
; SI-LABEL: global_atomic_uinc_wrap_i64_ret_offset:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: v_mov_b32_e32 v4, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: .LBB134_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v10
+; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc
+; SI-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v8
+; SI-NEXT: v_mov_b32_e32 v1, v9
+; SI-NEXT: v_mov_b32_e32 v2, v10
+; SI-NEXT: v_mov_b32_e32 v3, v11
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_cbranch_execnz .LBB134_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_uinc_wrap_i64_ret_offset:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB134_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v8
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3]
+; VI-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB134_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB134_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB134_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst
@@ -9461,25 +10839,45 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: s_mov_b32 s34, s7
-; SI-NEXT: s_mov_b32 s35, s6
+; SI-NEXT: v_writelane_b32 v8, s6, 0
+; SI-NEXT: v_writelane_b32 v8, s7, 1
+; SI-NEXT: s_mov_b32 s35, s7
+; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB135_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v2
+; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[2:3]
+; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v7, v3
+; SI-NEXT: v_mov_b32_e32 v6, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB135_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v8, 1
+; SI-NEXT: v_readlane_b32 s6, v8, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -9487,24 +10885,58 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1
; VI-LABEL: global_atomic_uinc_wrap_i64_noret_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: .LBB135_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, v1
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB135_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB135_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB135_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
ret void
@@ -9515,23 +10947,45 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addr
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_writelane_b32 v8, s6, 0
+; SI-NEXT: v_writelane_b32 v8, s7, 1
+; SI-NEXT: s_mov_b32 s35, s7
+; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB136_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v2
+; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[2:3]
+; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v7, v3
+; SI-NEXT: v_mov_b32_e32 v6, v2
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB136_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v8, 1
+; SI-NEXT: v_readlane_b32 s6, v8, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -9541,24 +10995,56 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addr
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s34
+; VI-NEXT: v_mov_b32_e32 v5, s35
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB136_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, v1
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB136_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB136_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB136_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst
@@ -9570,25 +11056,45 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) i
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: s_mov_b32 s34, s7
-; SI-NEXT: s_mov_b32 s35, s6
+; SI-NEXT: v_writelane_b32 v6, s6, 0
+; SI-NEXT: v_writelane_b32 v6, s7, 1
+; SI-NEXT: s_mov_b32 s35, s7
+; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB137_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v4
+; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc
+; SI-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB137_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v6, 1
+; SI-NEXT: v_readlane_b32 s6, v6, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -9596,24 +11102,58 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) i
; VI-LABEL: global_atomic_uinc_wrap_i64_ret_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: s_mov_b64 s[34:35], 0
; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: .LBB137_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, v1
+; VI-NEXT: v_mov_b32_e32 v6, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v6
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
+; VI-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; VI-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB137_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB137_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v5
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[5:6]
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB137_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
ret i64 %result
@@ -9624,23 +11164,45 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_writelane_b32 v6, s6, 0
+; SI-NEXT: v_writelane_b32 v6, s7, 1
+; SI-NEXT: s_mov_b32 s35, s7
+; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: s_mov_b64 s[36:37], 0
+; SI-NEXT: .LBB138_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v4
+; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc
+; SI-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
+; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
+; SI-NEXT: s_cbranch_execnz .LBB138_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[36:37]
+; SI-NEXT: v_readlane_b32 s7, v6, 1
+; SI-NEXT: v_readlane_b32 s6, v6, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -9651,23 +11213,55 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_mov_b64 s[34:35], 0
+; VI-NEXT: .LBB138_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, v1
+; VI-NEXT: v_mov_b32_e32 v6, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v6
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
+; VI-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
+; VI-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; VI-NEXT: s_cbranch_execnz .LBB138_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: .LBB138_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v5
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[5:6]
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
+; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
+; GFX9-NEXT: s_cbranch_execnz .LBB138_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst
@@ -9756,30 +11350,95 @@ define void @global_atomic_udec_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
; SI-LABEL: global_atomic_udec_wrap_i64_noret:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s6, 0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s4, s6
-; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s8, s10
+; SI-NEXT: s_mov_b32 s9, s10
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[8:11], 0 addr64
+; SI-NEXT: s_mov_b64 s[6:7], 0
+; SI-NEXT: .LBB141_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v4, vcc, -1, v6
+; SI-NEXT: v_addc_u32_e32 v5, vcc, -1, v7, vcc
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; SI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v7
+; SI-NEXT: v_mov_b32_e32 v10, v6
+; SI-NEXT: v_mov_b32_e32 v9, v5
+; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; SI-NEXT: s_cbranch_execnz .LBB141_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[6:7]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_udec_wrap_i64_noret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: s_mov_b64 s[8:9], 0
+; VI-NEXT: .LBB141_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; VI-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6
+; VI-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; VI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; VI-NEXT: s_cbranch_execnz .LBB141_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[8:9]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_udec_wrap_i64_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[2:3], off
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-NEXT: .LBB141_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX9-NEXT: s_cbranch_execnz .LBB141_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
ret void
@@ -9789,13 +11448,38 @@ define void @global_atomic_udec_wrap_i64_noret_offset(ptr addrspace(1) %out, i64
; SI-LABEL: global_atomic_udec_wrap_i64_noret_offset:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s6, 0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s4, s6
-; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32
+; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s8, s10
+; SI-NEXT: s_mov_b32 s9, s10
+; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[8:11], 0 addr64 offset:32
+; SI-NEXT: s_mov_b64 s[6:7], 0
+; SI-NEXT: .LBB142_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v4, vcc, -1, v6
+; SI-NEXT: v_addc_u32_e32 v5, vcc, -1, v7, vcc
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; SI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v7
+; SI-NEXT: v_mov_b32_e32 v10, v6
+; SI-NEXT: v_mov_b32_e32 v9, v5
+; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; SI-NEXT: v_mov_b32_e32 v6, v8
+; SI-NEXT: v_mov_b32_e32 v7, v9
+; SI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; SI-NEXT: s_cbranch_execnz .LBB142_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[6:7]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -9804,17 +11488,57 @@ define void @global_atomic_udec_wrap_i64_noret_offset(ptr addrspace(1) %out, i64
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: s_mov_b64 s[8:9], 0
+; VI-NEXT: .LBB142_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; VI-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6
+; VI-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; VI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; VI-NEXT: s_cbranch_execnz .LBB142_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[8:9]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[2:3], off offset:32
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32
+; GFX9-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-NEXT: .LBB142_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX9-NEXT: s_cbranch_execnz .LBB142_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst
@@ -9825,32 +11549,103 @@ define i64 @global_atomic_udec_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-LABEL: global_atomic_udec_wrap_i64_ret:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s6, 0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s4, s6
-; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: v_mov_b32_e32 v4, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s8, s10
+; SI-NEXT: s_mov_b32 s9, s10
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[8:11], 0 addr64
+; SI-NEXT: s_mov_b64 s[6:7], 0
+; SI-NEXT: .LBB143_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v10
+; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v11, vcc
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[4:5]
+; SI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v9, v1, v5, vcc
+; SI-NEXT: v_cndmask_b32_e32 v8, v0, v4, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v8
+; SI-NEXT: v_mov_b32_e32 v1, v9
+; SI-NEXT: v_mov_b32_e32 v2, v10
+; SI-NEXT: v_mov_b32_e32 v3, v11
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; SI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; SI-NEXT: s_cbranch_execnz .LBB143_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[6:7]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_udec_wrap_i64_ret:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; VI-NEXT: s_mov_b64 s[8:9], 0
+; VI-NEXT: .LBB143_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: v_mov_b32_e32 v6, v4
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; VI-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6
+; VI-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; VI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; VI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; VI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; VI-NEXT: s_cbranch_execnz .LBB143_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[8:9]
+; VI-NEXT: v_mov_b32_e32 v0, v4
+; VI-NEXT: v_mov_b32_e32 v1, v5
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_udec_wrap_i64_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-NEXT: .LBB143_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX9-NEXT: s_cbranch_execnz .LBB143_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
ret i64 %result
@@ -9860,34 +11655,103 @@ define i64 @global_atomic_udec_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i
; SI-LABEL: global_atomic_udec_wrap_i64_ret_offset:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s6, 0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s4, s6
-; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v5, v3
+; SI-NEXT: v_mov_b32_e32 v4, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s8, s10
+; SI-NEXT: s_mov_b32 s9, s10
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[8:11], 0 addr64 offset:32
+; SI-NEXT: s_mov_b64 s[6:7], 0
+; SI-NEXT: .LBB144_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v10
+; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v11, vcc
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[4:5]
+; SI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v9, v1, v5, vcc
+; SI-NEXT: v_cndmask_b32_e32 v8, v0, v4, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v8
+; SI-NEXT: v_mov_b32_e32 v1, v9
+; SI-NEXT: v_mov_b32_e32 v2, v10
+; SI-NEXT: v_mov_b32_e32 v3, v11
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[8:11], 0 addr64 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; SI-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; SI-NEXT: s_cbranch_execnz .LBB144_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[6:7]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; VI-NEXT: s_mov_b64 s[8:9], 0
+; VI-NEXT: .LBB144_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3]
+; VI-NEXT: v_add_u32_e64 v0, s[6:7], -1, v8
+; VI-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7]
+; VI-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; VI-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; VI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; VI-NEXT: s_cbranch_execnz .LBB144_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[8:9]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32
+; GFX9-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-NEXT: .LBB144_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
+; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7]
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX9-NEXT: s_cbranch_execnz .LBB144_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst
@@ -9899,25 +11763,49 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: s_mov_b32 s34, s7
-; SI-NEXT: s_mov_b32 s35, s6
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
+; SI-NEXT: s_mov_b32 s35, s7
+; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[38:39], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
+; SI-NEXT: .LBB145_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0
+; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v2
+; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[2:3]
+; SI-NEXT: s_or_b64 vcc, vcc, s[36:37]
+; SI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
+; SI-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; SI-NEXT: s_cbranch_execnz .LBB145_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[38:39]
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -9925,24 +11813,66 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1
; VI-LABEL: global_atomic_udec_wrap_i64_noret_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: s_mov_b64 s[38:39], 0
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: .LBB145_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
+; VI-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2
+; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37]
+; VI-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, v1
+; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; VI-NEXT: s_cbranch_execnz .LBB145_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[38:39]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-NEXT: .LBB145_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
+; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2
+; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37]
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: s_cbranch_execnz .LBB145_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
ret void
@@ -9953,23 +11883,49 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
+; SI-NEXT: s_mov_b32 s35, s7
+; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
+; SI-NEXT: s_mov_b64 s[38:39], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
+; SI-NEXT: .LBB146_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v2
+; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[2:3]
+; SI-NEXT: s_or_b64 vcc, vcc, s[36:37]
+; SI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, v3
+; SI-NEXT: v_mov_b32_e32 v8, v2
+; SI-NEXT: v_mov_b32_e32 v7, v1
+; SI-NEXT: v_mov_b32_e32 v6, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; SI-NEXT: v_mov_b32_e32 v2, v6
+; SI-NEXT: v_mov_b32_e32 v3, v7
+; SI-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; SI-NEXT: s_cbranch_execnz .LBB146_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[38:39]
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -9979,24 +11935,64 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s34
+; VI-NEXT: v_mov_b32_e32 v5, s35
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; VI-NEXT: s_mov_b64 s[38:39], 0
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: .LBB146_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
+; VI-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2
+; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37]
+; VI-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, v1
+; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; VI-NEXT: v_mov_b32_e32 v2, v0
+; VI-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; VI-NEXT: s_cbranch_execnz .LBB146_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[38:39]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[4:5] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
+; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-NEXT: .LBB146_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
+; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2
+; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37]
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: s_cbranch_execnz .LBB146_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst
@@ -10008,25 +12004,49 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) i
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: s_mov_b32 s34, s7
-; SI-NEXT: s_mov_b32 s35, s6
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
+; SI-NEXT: s_mov_b32 s35, s7
+; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s35
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_mov_b64 s[38:39], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
+; SI-NEXT: .LBB147_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v8
+; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v9, vcc
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[8:9]
+; SI-NEXT: s_or_b64 vcc, vcc, s[36:37]
+; SI-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; SI-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; SI-NEXT: s_cbranch_execnz .LBB147_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[38:39]
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -10034,24 +12054,66 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) i
; VI-LABEL: global_atomic_udec_wrap_i64_ret_scalar:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: s_mov_b64 s[38:39], 0
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_mov_b32_e32 v5, s6
; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: .LBB147_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9]
+; VI-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8
+; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37]
+; VI-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; VI-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; VI-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; VI-NEXT: s_cbranch_execnz .LBB147_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[38:39]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
+; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v4, s6
+; GFX9-NEXT: .LBB147_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[7:8]
+; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[7:8]
+; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v7
+; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v8, s[36:37]
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v1, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v4, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: s_cbranch_execnz .LBB147_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
ret i64 %result
@@ -10062,23 +12124,49 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspa
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v2, s6, 0
-; SI-NEXT: v_writelane_b32 v2, s7, 1
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_writelane_b32 v10, s6, 0
+; SI-NEXT: v_writelane_b32 v10, s7, 1
+; SI-NEXT: s_mov_b32 s35, s7
+; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: s_mov_b64 s[38:39], 0
+; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v5, s34
+; SI-NEXT: .LBB148_1: ; %atomicrmw.start
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v9, v1
+; SI-NEXT: v_mov_b32_e32 v8, v0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v8
+; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v9, vcc
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[8:9]
+; SI-NEXT: s_or_b64 vcc, vcc, s[36:37]
+; SI-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v6
+; SI-NEXT: v_mov_b32_e32 v1, v7
+; SI-NEXT: v_mov_b32_e32 v2, v8
+; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v2, 1
-; SI-NEXT: v_readlane_b32 s6, v2, 0
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; SI-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; SI-NEXT: s_cbranch_execnz .LBB148_1
+; SI-NEXT: ; %bb.2: ; %atomicrmw.end
+; SI-NEXT: s_or_b64 exec, exec, s[38:39]
+; SI-NEXT: v_readlane_b32 s7, v10, 1
+; SI-NEXT: v_readlane_b32 s6, v10, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -10089,23 +12177,63 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspa
; VI-NEXT: s_add_u32 s34, s4, 32
; VI-NEXT: s_addc_u32 s35, s5, 0
; VI-NEXT: v_mov_b32_e32 v2, s34
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v3, s35
-; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_mov_b64 s[38:39], 0
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_mov_b32_e32 v5, s6
+; VI-NEXT: .LBB148_1: ; %atomicrmw.start
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9]
+; VI-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8
+; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37]
+; VI-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; VI-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
+; VI-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
+; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; VI-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; VI-NEXT: s_cbranch_execnz .LBB148_1
+; VI-NEXT: ; %bb.2: ; %atomicrmw.end
+; VI-NEXT: s_or_b64 exec, exec, s[38:39]
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
+; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v4, s6
+; GFX9-NEXT: .LBB148_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[7:8]
+; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[7:8]
+; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v7
+; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v8, s[36:37]
+; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v1, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v4, vcc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: s_cbranch_execnz .LBB148_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(1) %out, i64 4
%result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clause-limit-attr.mir b/llvm/test/CodeGen/AMDGPU/hard-clause-limit-attr.mir
index cfd4ea1..fb3f328 100644
--- a/llvm/test/CodeGen/AMDGPU/hard-clause-limit-attr.mir
+++ b/llvm/test/CodeGen/AMDGPU/hard-clause-limit-attr.mir
@@ -18,7 +18,7 @@ body: |
; GFX11-LABEL: name: long_clause_32
; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit-def $vgpr6, implicit-def $vgpr6_lo16, implicit-def $vgpr6_hi16, implicit-def $vgpr7, implicit-def $vgpr7_lo16, implicit-def $vgpr7_hi16, implicit-def $vgpr8, implicit-def $vgpr8_lo16, implicit-def $vgpr8_hi16, implicit-def $vgpr9, implicit-def $vgpr9_lo16, implicit-def $vgpr9_hi16, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit-def $vgpr17, implicit-def $vgpr17_lo16, implicit-def $vgpr17_hi16, implicit-def $vgpr18, implicit-def $vgpr18_lo16, implicit-def $vgpr18_hi16, implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr24, implicit-def $vgpr24_lo16, implicit-def $vgpr24_hi16, implicit-def $vgpr25, implicit-def $vgpr25_lo16, implicit-def $vgpr25_hi16, implicit-def $vgpr26, implicit-def $vgpr26_lo16, implicit-def $vgpr26_hi16, implicit-def $vgpr27, implicit-def $vgpr27_lo16, implicit-def $vgpr27_hi16, implicit-def $vgpr28, implicit-def $vgpr28_lo16, implicit-def $vgpr28_hi16, implicit-def $vgpr29, implicit-def $vgpr29_lo16, implicit-def $vgpr29_hi16, implicit-def $vgpr30, implicit-def $vgpr30_lo16, implicit-def $vgpr30_hi16, implicit-def $vgpr31, implicit-def $vgpr31_lo16, implicit-def $vgpr31_hi16, implicit-def $vgpr32, implicit-def $vgpr32_lo16, implicit-def $vgpr32_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7, implicit-def $vgpr8, implicit-def $vgpr9, implicit-def $vgpr10, implicit-def $vgpr11, implicit-def $vgpr12, implicit-def $vgpr13, implicit-def $vgpr14, implicit-def $vgpr15, implicit-def $vgpr16, implicit-def $vgpr17, implicit-def $vgpr18, implicit-def $vgpr19, implicit-def $vgpr20, implicit-def $vgpr21, implicit-def $vgpr22, implicit-def $vgpr23, implicit-def $vgpr24, implicit-def $vgpr25, implicit-def $vgpr26, implicit-def $vgpr27, implicit-def $vgpr28, implicit-def $vgpr29, implicit-def $vgpr30, implicit-def $vgpr31, implicit-def $vgpr32, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-NEXT: S_CLAUSE 31
; GFX11-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec
@@ -96,7 +96,7 @@ body: |
; GFX11-LABEL: name: long_clause_7
; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit-def $vgpr6, implicit-def $vgpr6_lo16, implicit-def $vgpr6_hi16, implicit-def $vgpr7, implicit-def $vgpr7_lo16, implicit-def $vgpr7_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-NEXT: S_CLAUSE 6
; GFX11-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec
@@ -106,7 +106,7 @@ body: |
; GFX11-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, implicit $exec
; GFX11-NEXT: }
- ; GFX11-NEXT: BUNDLE implicit-def $vgpr8, implicit-def $vgpr8_lo16, implicit-def $vgpr8_hi16, implicit-def $vgpr9, implicit-def $vgpr9_lo16, implicit-def $vgpr9_hi16, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-NEXT: BUNDLE implicit-def $vgpr8, implicit-def $vgpr9, implicit-def $vgpr10, implicit-def $vgpr11, implicit-def $vgpr12, implicit-def $vgpr13, implicit-def $vgpr14, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-NEXT: S_CLAUSE 6
; GFX11-NEXT: $vgpr8 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr9 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec
@@ -116,7 +116,7 @@ body: |
; GFX11-NEXT: $vgpr13 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, implicit $exec
; GFX11-NEXT: }
- ; GFX11-NEXT: BUNDLE implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit-def $vgpr17, implicit-def $vgpr17_lo16, implicit-def $vgpr17_hi16, implicit-def $vgpr18, implicit-def $vgpr18_lo16, implicit-def $vgpr18_hi16, implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-NEXT: BUNDLE implicit-def $vgpr15, implicit-def $vgpr16, implicit-def $vgpr17, implicit-def $vgpr18, implicit-def $vgpr19, implicit-def $vgpr20, implicit-def $vgpr21, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-NEXT: S_CLAUSE 6
; GFX11-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr16 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec
@@ -126,7 +126,7 @@ body: |
; GFX11-NEXT: $vgpr20 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr21 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec
; GFX11-NEXT: }
- ; GFX11-NEXT: BUNDLE implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr24, implicit-def $vgpr24_lo16, implicit-def $vgpr24_hi16, implicit-def $vgpr25, implicit-def $vgpr25_lo16, implicit-def $vgpr25_hi16, implicit-def $vgpr26, implicit-def $vgpr26_lo16, implicit-def $vgpr26_hi16, implicit-def $vgpr27, implicit-def $vgpr27_lo16, implicit-def $vgpr27_hi16, implicit-def $vgpr28, implicit-def $vgpr28_lo16, implicit-def $vgpr28_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-NEXT: BUNDLE implicit-def $vgpr22, implicit-def $vgpr23, implicit-def $vgpr24, implicit-def $vgpr25, implicit-def $vgpr26, implicit-def $vgpr27, implicit-def $vgpr28, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-NEXT: S_CLAUSE 6
; GFX11-NEXT: $vgpr22 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr23 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, implicit $exec
@@ -136,7 +136,7 @@ body: |
; GFX11-NEXT: $vgpr27 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr28 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, implicit $exec
; GFX11-NEXT: }
- ; GFX11-NEXT: BUNDLE implicit-def $vgpr29, implicit-def $vgpr29_lo16, implicit-def $vgpr29_hi16, implicit-def $vgpr30, implicit-def $vgpr30_lo16, implicit-def $vgpr30_hi16, implicit-def $vgpr31, implicit-def $vgpr31_lo16, implicit-def $vgpr31_hi16, implicit-def $vgpr32, implicit-def $vgpr32_lo16, implicit-def $vgpr32_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-NEXT: BUNDLE implicit-def $vgpr29, implicit-def $vgpr30, implicit-def $vgpr31, implicit-def $vgpr32, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-NEXT: S_CLAUSE 3
; GFX11-NEXT: $vgpr29 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clause-limit.mir b/llvm/test/CodeGen/AMDGPU/hard-clause-limit.mir
index 98221c2..c00e25c 100644
--- a/llvm/test/CodeGen/AMDGPU/hard-clause-limit.mir
+++ b/llvm/test/CodeGen/AMDGPU/hard-clause-limit.mir
@@ -14,7 +14,7 @@ body: |
; GFX11-32-LABEL: name: long_clause
; GFX11-32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
; GFX11-32-NEXT: {{ $}}
- ; GFX11-32-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit-def $vgpr6, implicit-def $vgpr6_lo16, implicit-def $vgpr6_hi16, implicit-def $vgpr7, implicit-def $vgpr7_lo16, implicit-def $vgpr7_hi16, implicit-def $vgpr8, implicit-def $vgpr8_lo16, implicit-def $vgpr8_hi16, implicit-def $vgpr9, implicit-def $vgpr9_lo16, implicit-def $vgpr9_hi16, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit-def $vgpr17, implicit-def $vgpr17_lo16, implicit-def $vgpr17_hi16, implicit-def $vgpr18, implicit-def $vgpr18_lo16, implicit-def $vgpr18_hi16, implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr24, implicit-def $vgpr24_lo16, implicit-def $vgpr24_hi16, implicit-def $vgpr25, implicit-def $vgpr25_lo16, implicit-def $vgpr25_hi16, implicit-def $vgpr26, implicit-def $vgpr26_lo16, implicit-def $vgpr26_hi16, implicit-def $vgpr27, implicit-def $vgpr27_lo16, implicit-def $vgpr27_hi16, implicit-def $vgpr28, implicit-def $vgpr28_lo16, implicit-def $vgpr28_hi16, implicit-def $vgpr29, implicit-def $vgpr29_lo16, implicit-def $vgpr29_hi16, implicit-def $vgpr30, implicit-def $vgpr30_lo16, implicit-def $vgpr30_hi16, implicit-def $vgpr31, implicit-def $vgpr31_lo16, implicit-def $vgpr31_hi16, implicit-def $vgpr32, implicit-def $vgpr32_lo16, implicit-def $vgpr32_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-32-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7, implicit-def $vgpr8, implicit-def $vgpr9, implicit-def $vgpr10, implicit-def $vgpr11, implicit-def $vgpr12, implicit-def $vgpr13, implicit-def $vgpr14, implicit-def $vgpr15, implicit-def $vgpr16, implicit-def $vgpr17, implicit-def $vgpr18, implicit-def $vgpr19, implicit-def $vgpr20, implicit-def $vgpr21, implicit-def $vgpr22, implicit-def $vgpr23, implicit-def $vgpr24, implicit-def $vgpr25, implicit-def $vgpr26, implicit-def $vgpr27, implicit-def $vgpr28, implicit-def $vgpr29, implicit-def $vgpr30, implicit-def $vgpr31, implicit-def $vgpr32, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-32-NEXT: S_CLAUSE 31
; GFX11-32-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec
; GFX11-32-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec
@@ -53,7 +53,7 @@ body: |
; GFX11-16-LABEL: name: long_clause
; GFX11-16: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
; GFX11-16-NEXT: {{ $}}
- ; GFX11-16-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit-def $vgpr6, implicit-def $vgpr6_lo16, implicit-def $vgpr6_hi16, implicit-def $vgpr7, implicit-def $vgpr7_lo16, implicit-def $vgpr7_hi16, implicit-def $vgpr8, implicit-def $vgpr8_lo16, implicit-def $vgpr8_hi16, implicit-def $vgpr9, implicit-def $vgpr9_lo16, implicit-def $vgpr9_hi16, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-16-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7, implicit-def $vgpr8, implicit-def $vgpr9, implicit-def $vgpr10, implicit-def $vgpr11, implicit-def $vgpr12, implicit-def $vgpr13, implicit-def $vgpr14, implicit-def $vgpr15, implicit-def $vgpr16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-16-NEXT: S_CLAUSE 15
; GFX11-16-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec
; GFX11-16-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec
@@ -72,7 +72,7 @@ body: |
; GFX11-16-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec
; GFX11-16-NEXT: $vgpr16 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec
; GFX11-16-NEXT: }
- ; GFX11-16-NEXT: BUNDLE implicit-def $vgpr17, implicit-def $vgpr17_lo16, implicit-def $vgpr17_hi16, implicit-def $vgpr18, implicit-def $vgpr18_lo16, implicit-def $vgpr18_hi16, implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr24, implicit-def $vgpr24_lo16, implicit-def $vgpr24_hi16, implicit-def $vgpr25, implicit-def $vgpr25_lo16, implicit-def $vgpr25_hi16, implicit-def $vgpr26, implicit-def $vgpr26_lo16, implicit-def $vgpr26_hi16, implicit-def $vgpr27, implicit-def $vgpr27_lo16, implicit-def $vgpr27_hi16, implicit-def $vgpr28, implicit-def $vgpr28_lo16, implicit-def $vgpr28_hi16, implicit-def $vgpr29, implicit-def $vgpr29_lo16, implicit-def $vgpr29_hi16, implicit-def $vgpr30, implicit-def $vgpr30_lo16, implicit-def $vgpr30_hi16, implicit-def $vgpr31, implicit-def $vgpr31_lo16, implicit-def $vgpr31_hi16, implicit-def $vgpr32, implicit-def $vgpr32_lo16, implicit-def $vgpr32_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-16-NEXT: BUNDLE implicit-def $vgpr17, implicit-def $vgpr18, implicit-def $vgpr19, implicit-def $vgpr20, implicit-def $vgpr21, implicit-def $vgpr22, implicit-def $vgpr23, implicit-def $vgpr24, implicit-def $vgpr25, implicit-def $vgpr26, implicit-def $vgpr27, implicit-def $vgpr28, implicit-def $vgpr29, implicit-def $vgpr30, implicit-def $vgpr31, implicit-def $vgpr32, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-16-NEXT: S_CLAUSE 15
; GFX11-16-NEXT: $vgpr17 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, implicit $exec
; GFX11-16-NEXT: $vgpr18 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, implicit $exec
@@ -95,7 +95,7 @@ body: |
; GFX11-10-LABEL: name: long_clause
; GFX11-10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
; GFX11-10-NEXT: {{ $}}
- ; GFX11-10-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit-def $vgpr6, implicit-def $vgpr6_lo16, implicit-def $vgpr6_hi16, implicit-def $vgpr7, implicit-def $vgpr7_lo16, implicit-def $vgpr7_hi16, implicit-def $vgpr8, implicit-def $vgpr8_lo16, implicit-def $vgpr8_hi16, implicit-def $vgpr9, implicit-def $vgpr9_lo16, implicit-def $vgpr9_hi16, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-10-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7, implicit-def $vgpr8, implicit-def $vgpr9, implicit-def $vgpr10, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-10-NEXT: S_CLAUSE 9
; GFX11-10-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec
; GFX11-10-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec
@@ -108,7 +108,7 @@ body: |
; GFX11-10-NEXT: $vgpr9 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec
; GFX11-10-NEXT: $vgpr10 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, implicit $exec
; GFX11-10-NEXT: }
- ; GFX11-10-NEXT: BUNDLE implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit-def $vgpr17, implicit-def $vgpr17_lo16, implicit-def $vgpr17_hi16, implicit-def $vgpr18, implicit-def $vgpr18_lo16, implicit-def $vgpr18_hi16, implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-10-NEXT: BUNDLE implicit-def $vgpr11, implicit-def $vgpr12, implicit-def $vgpr13, implicit-def $vgpr14, implicit-def $vgpr15, implicit-def $vgpr16, implicit-def $vgpr17, implicit-def $vgpr18, implicit-def $vgpr19, implicit-def $vgpr20, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-10-NEXT: S_CLAUSE 9
; GFX11-10-NEXT: $vgpr11 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, implicit $exec
; GFX11-10-NEXT: $vgpr12 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, implicit $exec
@@ -121,7 +121,7 @@ body: |
; GFX11-10-NEXT: $vgpr19 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, implicit $exec
; GFX11-10-NEXT: $vgpr20 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec
; GFX11-10-NEXT: }
- ; GFX11-10-NEXT: BUNDLE implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr24, implicit-def $vgpr24_lo16, implicit-def $vgpr24_hi16, implicit-def $vgpr25, implicit-def $vgpr25_lo16, implicit-def $vgpr25_hi16, implicit-def $vgpr26, implicit-def $vgpr26_lo16, implicit-def $vgpr26_hi16, implicit-def $vgpr27, implicit-def $vgpr27_lo16, implicit-def $vgpr27_hi16, implicit-def $vgpr28, implicit-def $vgpr28_lo16, implicit-def $vgpr28_hi16, implicit-def $vgpr29, implicit-def $vgpr29_lo16, implicit-def $vgpr29_hi16, implicit-def $vgpr30, implicit-def $vgpr30_lo16, implicit-def $vgpr30_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-10-NEXT: BUNDLE implicit-def $vgpr21, implicit-def $vgpr22, implicit-def $vgpr23, implicit-def $vgpr24, implicit-def $vgpr25, implicit-def $vgpr26, implicit-def $vgpr27, implicit-def $vgpr28, implicit-def $vgpr29, implicit-def $vgpr30, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-10-NEXT: S_CLAUSE 9
; GFX11-10-NEXT: $vgpr21 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec
; GFX11-10-NEXT: $vgpr22 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec
@@ -134,7 +134,7 @@ body: |
; GFX11-10-NEXT: $vgpr29 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec
; GFX11-10-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec
; GFX11-10-NEXT: }
- ; GFX11-10-NEXT: BUNDLE implicit-def $vgpr31, implicit-def $vgpr31_lo16, implicit-def $vgpr31_hi16, implicit-def $vgpr32, implicit-def $vgpr32_lo16, implicit-def $vgpr32_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-10-NEXT: BUNDLE implicit-def $vgpr31, implicit-def $vgpr32, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-10-NEXT: S_CLAUSE 1
; GFX11-10-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec
; GFX11-10-NEXT: $vgpr32 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec
@@ -143,56 +143,56 @@ body: |
; GFX11-4-LABEL: name: long_clause
; GFX11-4: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
; GFX11-4-NEXT: {{ $}}
- ; GFX11-4-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-4-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-4-NEXT: S_CLAUSE 3
; GFX11-4-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec
; GFX11-4-NEXT: }
- ; GFX11-4-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit-def $vgpr6, implicit-def $vgpr6_lo16, implicit-def $vgpr6_hi16, implicit-def $vgpr7, implicit-def $vgpr7_lo16, implicit-def $vgpr7_hi16, implicit-def $vgpr8, implicit-def $vgpr8_lo16, implicit-def $vgpr8_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-4-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7, implicit-def $vgpr8, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-4-NEXT: S_CLAUSE 3
; GFX11-4-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr8 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, implicit $exec
; GFX11-4-NEXT: }
- ; GFX11-4-NEXT: BUNDLE implicit-def $vgpr9, implicit-def $vgpr9_lo16, implicit-def $vgpr9_hi16, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-4-NEXT: BUNDLE implicit-def $vgpr9, implicit-def $vgpr10, implicit-def $vgpr11, implicit-def $vgpr12, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-4-NEXT: S_CLAUSE 3
; GFX11-4-NEXT: $vgpr9 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr10 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr11 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr12 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, implicit $exec
; GFX11-4-NEXT: }
- ; GFX11-4-NEXT: BUNDLE implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-4-NEXT: BUNDLE implicit-def $vgpr13, implicit-def $vgpr14, implicit-def $vgpr15, implicit-def $vgpr16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-4-NEXT: S_CLAUSE 3
; GFX11-4-NEXT: $vgpr13 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr16 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec
; GFX11-4-NEXT: }
- ; GFX11-4-NEXT: BUNDLE implicit-def $vgpr17, implicit-def $vgpr17_lo16, implicit-def $vgpr17_hi16, implicit-def $vgpr18, implicit-def $vgpr18_lo16, implicit-def $vgpr18_hi16, implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-4-NEXT: BUNDLE implicit-def $vgpr17, implicit-def $vgpr18, implicit-def $vgpr19, implicit-def $vgpr20, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-4-NEXT: S_CLAUSE 3
; GFX11-4-NEXT: $vgpr17 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr18 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr19 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr20 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec
; GFX11-4-NEXT: }
- ; GFX11-4-NEXT: BUNDLE implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr24, implicit-def $vgpr24_lo16, implicit-def $vgpr24_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-4-NEXT: BUNDLE implicit-def $vgpr21, implicit-def $vgpr22, implicit-def $vgpr23, implicit-def $vgpr24, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-4-NEXT: S_CLAUSE 3
; GFX11-4-NEXT: $vgpr21 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr22 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr23 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr24 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, implicit $exec
; GFX11-4-NEXT: }
- ; GFX11-4-NEXT: BUNDLE implicit-def $vgpr25, implicit-def $vgpr25_lo16, implicit-def $vgpr25_hi16, implicit-def $vgpr26, implicit-def $vgpr26_lo16, implicit-def $vgpr26_hi16, implicit-def $vgpr27, implicit-def $vgpr27_lo16, implicit-def $vgpr27_hi16, implicit-def $vgpr28, implicit-def $vgpr28_lo16, implicit-def $vgpr28_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-4-NEXT: BUNDLE implicit-def $vgpr25, implicit-def $vgpr26, implicit-def $vgpr27, implicit-def $vgpr28, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-4-NEXT: S_CLAUSE 3
; GFX11-4-NEXT: $vgpr25 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr26 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr27 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr28 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, implicit $exec
; GFX11-4-NEXT: }
- ; GFX11-4-NEXT: BUNDLE implicit-def $vgpr29, implicit-def $vgpr29_lo16, implicit-def $vgpr29_hi16, implicit-def $vgpr30, implicit-def $vgpr30_lo16, implicit-def $vgpr30_hi16, implicit-def $vgpr31, implicit-def $vgpr31_lo16, implicit-def $vgpr31_hi16, implicit-def $vgpr32, implicit-def $vgpr32_lo16, implicit-def $vgpr32_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-4-NEXT: BUNDLE implicit-def $vgpr29, implicit-def $vgpr30, implicit-def $vgpr31, implicit-def $vgpr32, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-4-NEXT: S_CLAUSE 3
; GFX11-4-NEXT: $vgpr29 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec
; GFX11-4-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir
index 8007597..79480bc 100644
--- a/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir
@@ -1,6 +1,507 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GFX12
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GFX12
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GFX12,GFX1200
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GFX12,GFX1250
+
+---
+name: long_clause
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
+ ; GFX1200-LABEL: name: long_clause
+ ; GFX1200: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7, implicit-def $vgpr8, implicit-def $vgpr9, implicit-def $vgpr10, implicit-def $vgpr11, implicit-def $vgpr12, implicit-def $vgpr13, implicit-def $vgpr14, implicit-def $vgpr15, implicit-def $vgpr16, implicit-def $vgpr17, implicit-def $vgpr18, implicit-def $vgpr19, implicit-def $vgpr20, implicit-def $vgpr21, implicit-def $vgpr22, implicit-def $vgpr23, implicit-def $vgpr24, implicit-def $vgpr25, implicit-def $vgpr26, implicit-def $vgpr27, implicit-def $vgpr28, implicit-def $vgpr29, implicit-def $vgpr30, implicit-def $vgpr31, implicit-def $vgpr32, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX1200-NEXT: S_CLAUSE 31
+ ; GFX1200-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr8 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr9 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr10 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr11 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr12 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr13 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr16 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr17 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr18 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr19 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr20 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr21 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr22 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr23 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr24 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr25 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr26 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr27 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr28 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr29 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr32 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec
+ ; GFX1200-NEXT: }
+ ; GFX1200-NEXT: BUNDLE implicit-def $vgpr33, implicit-def $vgpr34, implicit-def $vgpr35, implicit-def $vgpr36, implicit-def $vgpr37, implicit-def $vgpr38, implicit-def $vgpr39, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr42, implicit-def $vgpr43, implicit-def $vgpr44, implicit-def $vgpr45, implicit-def $vgpr46, implicit-def $vgpr47, implicit-def $vgpr48, implicit-def $vgpr49, implicit-def $vgpr50, implicit-def $vgpr51, implicit-def $vgpr52, implicit-def $vgpr53, implicit-def $vgpr54, implicit-def $vgpr55, implicit-def $vgpr56, implicit-def $vgpr57, implicit-def $vgpr58, implicit-def $vgpr59, implicit-def $vgpr60, implicit-def $vgpr61, implicit-def $vgpr62, implicit-def $vgpr63, implicit-def $vgpr64, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX1200-NEXT: S_CLAUSE 31
+ ; GFX1200-NEXT: $vgpr33 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr34 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr35 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 140, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr36 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 144, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr37 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 148, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr38 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 152, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr39 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 156, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 160, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 164, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 168, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 172, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 176, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 180, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 184, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 188, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr48 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 192, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr49 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 196, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr50 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 200, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr51 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 204, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr52 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 208, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr53 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 212, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr54 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 216, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr55 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 220, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 224, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 228, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 232, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 236, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 240, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 244, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 248, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 252, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr64 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec
+ ; GFX1200-NEXT: }
+ ; GFX1200-NEXT: BUNDLE implicit-def $vgpr65, implicit-def $vgpr66, implicit-def $vgpr67, implicit-def $vgpr68, implicit-def $vgpr69, implicit-def $vgpr70, implicit-def $vgpr71, implicit-def $vgpr72, implicit-def $vgpr73, implicit-def $vgpr74, implicit-def $vgpr75, implicit-def $vgpr76, implicit-def $vgpr77, implicit-def $vgpr78, implicit-def $vgpr79, implicit-def $vgpr80, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX1200-NEXT: S_CLAUSE 15
+ ; GFX1200-NEXT: $vgpr65 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr66 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 264, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr67 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 268, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr68 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 272, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr69 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 276, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr70 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 280, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr71 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 284, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr72 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 288, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr73 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 292, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr74 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 296, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr75 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 300, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr76 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 304, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr77 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 308, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr78 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 312, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr79 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 316, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr80 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 320, 0, 0, implicit $exec
+ ; GFX1200-NEXT: }
+ ;
+ ; GFX1250-LABEL: name: long_clause
+ ; GFX1250: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7, implicit-def $vgpr8, implicit-def $vgpr9, implicit-def $vgpr10, implicit-def $vgpr11, implicit-def $vgpr12, implicit-def $vgpr13, implicit-def $vgpr14, implicit-def $vgpr15, implicit-def $vgpr16, implicit-def $vgpr17, implicit-def $vgpr18, implicit-def $vgpr19, implicit-def $vgpr20, implicit-def $vgpr21, implicit-def $vgpr22, implicit-def $vgpr23, implicit-def $vgpr24, implicit-def $vgpr25, implicit-def $vgpr26, implicit-def $vgpr27, implicit-def $vgpr28, implicit-def $vgpr29, implicit-def $vgpr30, implicit-def $vgpr31, implicit-def $vgpr32, implicit-def $vgpr33, implicit-def $vgpr34, implicit-def $vgpr35, implicit-def $vgpr36, implicit-def $vgpr37, implicit-def $vgpr38, implicit-def $vgpr39, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr42, implicit-def $vgpr43, implicit-def $vgpr44, implicit-def $vgpr45, implicit-def $vgpr46, implicit-def $vgpr47, implicit-def $vgpr48, implicit-def $vgpr49, implicit-def $vgpr50, implicit-def $vgpr51, implicit-def $vgpr52, implicit-def $vgpr53, implicit-def $vgpr54, implicit-def $vgpr55, implicit-def $vgpr56, implicit-def $vgpr57, implicit-def $vgpr58, implicit-def $vgpr59, implicit-def $vgpr60, implicit-def $vgpr61, implicit-def $vgpr62, implicit-def $vgpr63, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX1250-NEXT: S_CLAUSE 62
+ ; GFX1250-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr8 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr9 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr10 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr11 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr12 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr13 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr16 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr17 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr18 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr19 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr20 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr21 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr22 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr23 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr24 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr25 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr26 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr27 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr28 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr29 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr32 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr33 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr34 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr35 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 140, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr36 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 144, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr37 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 148, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr38 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 152, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr39 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 156, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 160, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 164, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 168, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 172, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 176, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 180, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 184, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 188, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr48 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 192, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr49 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 196, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr50 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 200, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr51 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 204, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr52 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 208, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr53 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 212, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr54 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 216, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr55 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 220, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 224, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 228, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 232, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 236, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 240, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 244, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 248, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 252, 0, 0, implicit $exec
+ ; GFX1250-NEXT: }
+ ; GFX1250-NEXT: BUNDLE implicit-def $vgpr64, implicit-def $vgpr65, implicit-def $vgpr66, implicit-def $vgpr67, implicit-def $vgpr68, implicit-def $vgpr69, implicit-def $vgpr70, implicit-def $vgpr71, implicit-def $vgpr72, implicit-def $vgpr73, implicit-def $vgpr74, implicit-def $vgpr75, implicit-def $vgpr76, implicit-def $vgpr77, implicit-def $vgpr78, implicit-def $vgpr79, implicit-def $vgpr80, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX1250-NEXT: S_CLAUSE 16
+ ; GFX1250-NEXT: $vgpr64 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr65 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr66 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 264, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr67 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 268, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr68 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 272, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr69 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 276, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr70 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 280, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr71 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 284, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr72 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 288, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr73 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 292, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr74 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 296, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr75 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 300, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr76 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 304, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr77 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 308, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr78 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 312, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr79 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 316, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr80 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 320, 0, 0, implicit $exec
+ ; GFX1250-NEXT: }
+ $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec
+ $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec
+ $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec
+ $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec
+ $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, implicit $exec
+ $vgpr6 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, implicit $exec
+ $vgpr7 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, implicit $exec
+ $vgpr8 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, implicit $exec
+ $vgpr9 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, implicit $exec
+ $vgpr10 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, implicit $exec
+ $vgpr11 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, implicit $exec
+ $vgpr12 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, implicit $exec
+ $vgpr13 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, implicit $exec
+ $vgpr14 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, implicit $exec
+ $vgpr15 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, implicit $exec
+ $vgpr16 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, implicit $exec
+ $vgpr17 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, implicit $exec
+ $vgpr18 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, implicit $exec
+ $vgpr19 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, implicit $exec
+ $vgpr20 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, implicit $exec
+ $vgpr21 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, implicit $exec
+ $vgpr22 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, implicit $exec
+ $vgpr23 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, implicit $exec
+ $vgpr24 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, implicit $exec
+ $vgpr25 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, implicit $exec
+ $vgpr26 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, implicit $exec
+ $vgpr27 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, implicit $exec
+ $vgpr28 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, implicit $exec
+ $vgpr29 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, implicit $exec
+ $vgpr30 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, implicit $exec
+ $vgpr31 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec
+ $vgpr32 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec
+ $vgpr33 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec
+ $vgpr34 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, implicit $exec
+ $vgpr35 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 140, 0, 0, implicit $exec
+ $vgpr36 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 144, 0, 0, implicit $exec
+ $vgpr37 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 148, 0, 0, implicit $exec
+ $vgpr38 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 152, 0, 0, implicit $exec
+ $vgpr39 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 156, 0, 0, implicit $exec
+ $vgpr40 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 160, 0, 0, implicit $exec
+ $vgpr41 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 164, 0, 0, implicit $exec
+ $vgpr42 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 168, 0, 0, implicit $exec
+ $vgpr43 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 172, 0, 0, implicit $exec
+ $vgpr44 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 176, 0, 0, implicit $exec
+ $vgpr45 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 180, 0, 0, implicit $exec
+ $vgpr46 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 184, 0, 0, implicit $exec
+ $vgpr47 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 188, 0, 0, implicit $exec
+ $vgpr48 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 192, 0, 0, implicit $exec
+ $vgpr49 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 196, 0, 0, implicit $exec
+ $vgpr50 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 200, 0, 0, implicit $exec
+ $vgpr51 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 204, 0, 0, implicit $exec
+ $vgpr52 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 208, 0, 0, implicit $exec
+ $vgpr53 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 212, 0, 0, implicit $exec
+ $vgpr54 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 216, 0, 0, implicit $exec
+ $vgpr55 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 220, 0, 0, implicit $exec
+ $vgpr56 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 224, 0, 0, implicit $exec
+ $vgpr57 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 228, 0, 0, implicit $exec
+ $vgpr58 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 232, 0, 0, implicit $exec
+ $vgpr59 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 236, 0, 0, implicit $exec
+ $vgpr60 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 240, 0, 0, implicit $exec
+ $vgpr61 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 244, 0, 0, implicit $exec
+ $vgpr62 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 248, 0, 0, implicit $exec
+ $vgpr63 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 252, 0, 0, implicit $exec
+ $vgpr64 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec
+ $vgpr65 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, implicit $exec
+ $vgpr66 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 264, 0, 0, implicit $exec
+ $vgpr67 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 268, 0, 0, implicit $exec
+ $vgpr68 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 272, 0, 0, implicit $exec
+ $vgpr69 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 276, 0, 0, implicit $exec
+ $vgpr70 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 280, 0, 0, implicit $exec
+ $vgpr71 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 284, 0, 0, implicit $exec
+ $vgpr72 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 288, 0, 0, implicit $exec
+ $vgpr73 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 292, 0, 0, implicit $exec
+ $vgpr74 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 296, 0, 0, implicit $exec
+ $vgpr75 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 300, 0, 0, implicit $exec
+ $vgpr76 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 304, 0, 0, implicit $exec
+ $vgpr77 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 308, 0, 0, implicit $exec
+ $vgpr78 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 312, 0, 0, implicit $exec
+ $vgpr79 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 316, 0, 0, implicit $exec
+ $vgpr80 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 320, 0, 0, implicit $exec
+...
+
+---
+name: kill
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr4
+ ; GFX12-LABEL: name: kill
+ ; GFX12: liveins: $sgpr0_sgpr1, $sgpr4
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr3, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
+ ; GFX12-NEXT: S_CLAUSE 1
+ ; GFX12-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GFX12-NEXT: KILL undef renamable $sgpr4
+ ; GFX12-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
+ ; GFX12-NEXT: }
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ KILL undef renamable $sgpr4
+ $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
+...
+
+---
+name: kill2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr4, $sgpr5
+ ; GFX12-LABEL: name: kill2
+ ; GFX12: liveins: $sgpr0_sgpr1, $sgpr4, $sgpr5
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr3, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
+ ; GFX12-NEXT: S_CLAUSE 1
+ ; GFX12-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GFX12-NEXT: KILL undef renamable $sgpr4
+ ; GFX12-NEXT: $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
+ ; GFX12-NEXT: }
+ ; GFX12-NEXT: KILL undef renamable $sgpr5
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ KILL undef renamable $sgpr4
+ $sgpr3 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0
+ KILL undef renamable $sgpr5
+...
+
+---
+name: flat_load_atomic
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX1200-LABEL: name: flat_load_atomic
+ ; GFX1200: liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX1200-NEXT: $vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
+ ;
+ ; GFX1250-LABEL: name: flat_load_atomic
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: BUNDLE implicit-def $vgpr3, implicit-def $vgpr4, implicit $vgpr0_vgpr1, implicit $exec, implicit $flat_scr, implicit $vgpr2 {
+ ; GFX1250-NEXT: S_CLAUSE 1
+ ; GFX1250-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX1250-NEXT: $vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
+ ; GFX1250-NEXT: }
+ $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
+...
+
+---
+name: global_load_atomic
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX1200-LABEL: name: global_load_atomic
+ ; GFX1200: liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr4 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
+ ;
+ ; GFX1250-LABEL: name: global_load_atomic
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: BUNDLE implicit-def $vgpr3, implicit-def $vgpr4, implicit $vgpr0_vgpr1, implicit $exec, implicit $vgpr2 {
+ ; GFX1250-NEXT: S_CLAUSE 1
+ ; GFX1250-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr4 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
+ ; GFX1250-NEXT: }
+ $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr4 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
+...
+
+---
+name: flat_global_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; GFX12-LABEL: name: flat_global_load
+ ; GFX12: liveins: $vgpr0_vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX12-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $flat_scr
+ $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $flat_scr
+...
+
+---
+name: buffer_load_atomic
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0
+ ; GFX1200-LABEL: name: buffer_load_atomic
+ ; GFX1200: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+ ; GFX1200-NEXT: $vgpr0 = BUFFER_ATOMIC_ADD_OFFSET_RTN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 4, 0, 0, implicit $exec
+ ;
+ ; GFX1250-LABEL: name: buffer_load_atomic
+ ; GFX1250: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $exec, implicit $vgpr0 {
+ ; GFX1250-NEXT: S_CLAUSE 1
+ ; GFX1250-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: $vgpr0 = BUFFER_ATOMIC_ADD_OFFSET_RTN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 4, 0, 0, implicit $exec
+ ; GFX1250-NEXT: }
+ $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+ $vgpr0 = BUFFER_ATOMIC_ADD_OFFSET_RTN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 4, 0, 0, implicit $exec
+...
+
+---
+name: flat_load_store
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX1200-LABEL: name: flat_load_store
+ ; GFX1200: liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX1200-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
+ ;
+ ; GFX1250-LABEL: name: flat_load_store
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: BUNDLE implicit-def $vgpr3, implicit $vgpr0_vgpr1, implicit $exec, implicit $flat_scr, implicit $vgpr2 {
+ ; GFX1250-NEXT: S_CLAUSE 1
+ ; GFX1250-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX1250-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
+ ; GFX1250-NEXT: }
+ $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
+...
+
+---
+name: global_load_store
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX1200-LABEL: name: global_load_store
+ ; GFX1200: liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
+ ;
+ ; GFX1250-LABEL: name: global_load_store
+ ; GFX1250: liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: BUNDLE implicit-def $vgpr3, implicit $vgpr0_vgpr1, implicit $exec, implicit $vgpr2 {
+ ; GFX1250-NEXT: S_CLAUSE 1
+ ; GFX1250-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
+ ; GFX1250-NEXT: }
+ $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
+...
+
+---
+name: buffer_load_store
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0
+ ; GFX1200-LABEL: name: buffer_load_store
+ ; GFX1200: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec
+ ;
+ ; GFX1250-LABEL: name: buffer_load_store
+ ; GFX1250: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: BUNDLE implicit-def $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $exec, implicit $vgpr0 {
+ ; GFX1250-NEXT: S_CLAUSE 1
+ ; GFX1250-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec
+ ; GFX1250-NEXT: }
+ $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+ BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec
+...
+
+---
+name: flat_load_global_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX12-LABEL: name: flat_load_global_load
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX12-NEXT: $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+...
+
+---
+name: global_load_buffer_store
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4
+ ; GFX12-LABEL: name: global_load_buffer_store
+ ; GFX12: liveins: $vgpr0_vgpr1, $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GFX12-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec
+ $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec
+...
---
name: flat_prefetch_flat_load
@@ -31,3 +532,106 @@ body: |
GLOBAL_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec
$vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
...
+
+---
+name: async_load_async_store
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX12-LABEL: name: async_load_async_store
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: BUNDLE implicit-def $asynccnt, implicit $vgpr2, implicit $vgpr0_vgpr1, implicit $exec, implicit $asynccnt {
+ ; GFX12-NEXT: S_CLAUSE 1
+ ; GFX12-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 $vgpr2, $vgpr0_vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt
+ ; GFX12-NEXT: GLOBAL_STORE_ASYNC_FROM_LDS_B32 $vgpr0_vgpr1, $vgpr2, 32, 0, implicit-def $asynccnt, implicit $exec, implicit internal $asynccnt
+ ; GFX12-NEXT: }
+ GLOBAL_LOAD_ASYNC_TO_LDS_B32 $vgpr2, $vgpr0_vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt
+ GLOBAL_STORE_ASYNC_FROM_LDS_B32 $vgpr0_vgpr1, $vgpr2, 32, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt
+...
+
+---
+name: async_load_ds_load_tr
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX12-LABEL: name: async_load_ds_load_tr
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 $vgpr2, $vgpr0_vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt
+ ; GFX12-NEXT: $vgpr0_vgpr1 = DS_LOAD_TR8_B64 $vgpr2, 8, 0, implicit $exec
+ GLOBAL_LOAD_ASYNC_TO_LDS_B32 $vgpr2, $vgpr0_vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt
+ $vgpr0_vgpr1 = DS_LOAD_TR8_B64 $vgpr2, 8, 0, implicit $exec
+...
+
+---
+name: ds_load_trs_ds_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GFX12-LABEL: name: ds_load_trs_ds_load
+ ; GFX12: liveins: $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $vgpr4_vgpr5 = DS_LOAD_TR8_B64 $vgpr0, 0, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr0_vgpr1 = DS_LOAD_TR8_B64 $vgpr0, 8, 0, implicit $exec
+ ; GFX12-NEXT: $vgpr2_vgpr3 = DS_READ_B64_gfx9 $vgpr0, 16, 0, implicit $exec
+ $vgpr4_vgpr5 = DS_LOAD_TR8_B64 $vgpr0, 0, 0, implicit $exec
+ $vgpr0_vgpr1 = DS_LOAD_TR8_B64 $vgpr0, 8, 0, implicit $exec
+ $vgpr2_vgpr3 = DS_READ_B64_gfx9 $vgpr0, 16, 0, implicit $exec
+...
+
+# Make sure we do not clause DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 with anything
+---
+name: ds_atomic_async_barrier_arrive_b64_ds_read
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX12-LABEL: name: ds_atomic_async_barrier_arrive_b64_ds_read
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $vgpr2 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec
+ ; GFX12-NEXT: DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 $vgpr1, 0, 0, implicit-def $asynccnt, implicit $asynccnt, implicit $exec
+ ; GFX12-NEXT: $vgpr3 = DS_READ_B32_gfx9 $vgpr0, 16, 0, implicit $exec
+ $vgpr2 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec
+ DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 $vgpr1, 0, 0, implicit-def $asynccnt, implicit $asynccnt, implicit $exec
+ $vgpr3 = DS_READ_B32_gfx9 $vgpr0, 16, 0, implicit $exec
+...
+
+---
+name: ds_atomic_async_barrier_arrive_b64_flat_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GFX12-LABEL: name: ds_atomic_async_barrier_arrive_b64_flat_load
+ ; GFX12: liveins: $vgpr0, $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX12-NEXT: DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 $vgpr1, 0, 0, implicit-def $asynccnt, implicit $asynccnt, implicit $exec
+ ; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 16, 0, implicit $exec, implicit $flat_scr
+ $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 $vgpr1, 0, 0, implicit-def $asynccnt, implicit $asynccnt, implicit $exec
+ $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 16, 0, implicit $exec, implicit $flat_scr
+...
+
+---
+name: global_load_switching_scope
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; GFX12-LABEL: name: global_load_switching_scope
+ ; GFX12: liveins: $vgpr0_vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit $vgpr0_vgpr1, implicit $exec, implicit $flat_scr {
+ ; GFX12-NEXT: S_CLAUSE 1
+ ; GFX12-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GFX12-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 24, implicit $exec, implicit $flat_scr
+ ; GFX12-NEXT: }
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 24, implicit $exec, implicit $flat_scr
+...
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir
index 0829cab..7e1055b 100644
--- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir
@@ -11,7 +11,7 @@ body: |
; CHECK-LABEL: name: mimg_nsa
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr10_vgpr11, implicit-def $vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
; CHECK-NEXT: S_CLAUSE 1
; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
@@ -29,7 +29,7 @@ body: |
; CHECK-LABEL: name: mimg_nsa_mixed
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 {
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 {
; CHECK-NEXT: S_CLAUSE 2
; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx11 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir
index 243a8456..9689dda 100644
--- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir
@@ -10,7 +10,7 @@ body: |
; CHECK-LABEL: name: mimg
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr10_vgpr11, implicit-def $vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec {
; CHECK-NEXT: S_CLAUSE 1
; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
@@ -28,7 +28,7 @@ body: |
; CHECK-LABEL: name: mimg_mixed
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr20_vgpr21_vgpr22, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr20_vgpr21, implicit-def $vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 {
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 {
; CHECK-NEXT: S_CLAUSE 2
; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-load-monitor.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-load-monitor.mir
new file mode 100644
index 0000000..db4b946
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-load-monitor.mir
@@ -0,0 +1,38 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GCN
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GCN
+
+---
+name: async_load_flat_monitor_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GCN-LABEL: name: async_load_flat_monitor_load
+ ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 $vgpr2, $vgpr0_vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt
+ ; GCN-NEXT: $vgpr0 = FLAT_LOAD_MONITOR_B32 $vgpr0_vgpr1, 8, 0, implicit $exec, implicit $flat_scr
+ GLOBAL_LOAD_ASYNC_TO_LDS_B32 $vgpr2, $vgpr0_vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt
+ $vgpr0 = FLAT_LOAD_MONITOR_B32 $vgpr0_vgpr1, 8, 0, implicit $exec, implicit $flat_scr
+...
+
+---
+name: flat_monitor_loads_flat_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GCN-LABEL: name: flat_monitor_loads_flat_load
+ ; GCN: liveins: $vgpr0, $vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr4, implicit-def $vgpr3, implicit-def $vgpr0, implicit $vgpr0_vgpr1, implicit $exec, implicit $flat_scr {
+ ; GCN-NEXT: S_CLAUSE 2
+ ; GCN-NEXT: $vgpr4 = FLAT_LOAD_MONITOR_B32 $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: $vgpr3 = FLAT_LOAD_MONITOR_B32 $vgpr0_vgpr1, 8, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-NEXT: }
+ $vgpr4 = FLAT_LOAD_MONITOR_B32 $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr3 = FLAT_LOAD_MONITOR_B32 $vgpr0_vgpr1, 8, 0, implicit $exec, implicit $flat_scr
+ $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+...
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
index 44b988a..1341a59 100644
--- a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
@@ -39,7 +39,7 @@ body: |
; CHECK-LABEL: name: nop2
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1 {
+ ; CHECK-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr3, implicit $sgpr0_sgpr1 {
; CHECK-NEXT: S_CLAUSE 2
; CHECK-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; CHECK-NEXT: S_NOP 2
@@ -49,7 +49,7 @@ body: |
; GFX11-LABEL: name: nop2
; GFX11: liveins: $sgpr0_sgpr1
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1 {
+ ; GFX11-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr3, implicit $sgpr0_sgpr1 {
; GFX11-NEXT: S_CLAUSE 2
; GFX11-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GFX11-NEXT: S_NOP 2
@@ -59,7 +59,7 @@ body: |
; GFX12-LABEL: name: nop2
; GFX12: liveins: $sgpr0_sgpr1
; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1 {
+ ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr3, implicit $sgpr0_sgpr1 {
; GFX12-NEXT: S_CLAUSE 2
; GFX12-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GFX12-NEXT: S_NOP 2
@@ -79,7 +79,7 @@ body: |
; CHECK-LABEL: name: nop3
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1 {
+ ; CHECK-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr3, implicit $sgpr0_sgpr1 {
; CHECK-NEXT: S_CLAUSE 2
; CHECK-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; CHECK-NEXT: S_NOP 2
@@ -90,7 +90,7 @@ body: |
; GFX11-LABEL: name: nop3
; GFX11: liveins: $sgpr0_sgpr1
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1 {
+ ; GFX11-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr3, implicit $sgpr0_sgpr1 {
; GFX11-NEXT: S_CLAUSE 2
; GFX11-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GFX11-NEXT: S_NOP 2
@@ -101,7 +101,7 @@ body: |
; GFX12-LABEL: name: nop3
; GFX12: liveins: $sgpr0_sgpr1
; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1 {
+ ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr3, implicit $sgpr0_sgpr1 {
; GFX12-NEXT: S_CLAUSE 2
; GFX12-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GFX12-NEXT: S_NOP 2
@@ -123,7 +123,7 @@ body: |
; CHECK-LABEL: name: long_clause
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit-def $vgpr6, implicit-def $vgpr6_lo16, implicit-def $vgpr6_hi16, implicit-def $vgpr7, implicit-def $vgpr7_lo16, implicit-def $vgpr7_hi16, implicit-def $vgpr8, implicit-def $vgpr8_lo16, implicit-def $vgpr8_hi16, implicit-def $vgpr9, implicit-def $vgpr9_lo16, implicit-def $vgpr9_hi16, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit-def $vgpr17, implicit-def $vgpr17_lo16, implicit-def $vgpr17_hi16, implicit-def $vgpr18, implicit-def $vgpr18_lo16, implicit-def $vgpr18_hi16, implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr24, implicit-def $vgpr24_lo16, implicit-def $vgpr24_hi16, implicit-def $vgpr25, implicit-def $vgpr25_lo16, implicit-def $vgpr25_hi16, implicit-def $vgpr26, implicit-def $vgpr26_lo16, implicit-def $vgpr26_hi16, implicit-def $vgpr27, implicit-def $vgpr27_lo16, implicit-def $vgpr27_hi16, implicit-def $vgpr28, implicit-def $vgpr28_lo16, implicit-def $vgpr28_hi16, implicit-def $vgpr29, implicit-def $vgpr29_lo16, implicit-def $vgpr29_hi16, implicit-def $vgpr30, implicit-def $vgpr30_lo16, implicit-def $vgpr30_hi16, implicit-def $vgpr31, implicit-def $vgpr31_lo16, implicit-def $vgpr31_hi16, implicit-def $vgpr32, implicit-def $vgpr32_lo16, implicit-def $vgpr32_hi16, implicit-def $vgpr33, implicit-def $vgpr33_lo16, implicit-def $vgpr33_hi16, implicit-def $vgpr34, implicit-def $vgpr34_lo16, implicit-def $vgpr34_hi16, implicit-def $vgpr35, implicit-def $vgpr35_lo16, implicit-def $vgpr35_hi16, implicit-def $vgpr36, implicit-def $vgpr36_lo16, implicit-def $vgpr36_hi16, implicit-def $vgpr37, implicit-def $vgpr37_lo16, implicit-def $vgpr37_hi16, implicit-def $vgpr38, implicit-def $vgpr38_lo16, implicit-def $vgpr38_hi16, implicit-def $vgpr39, implicit-def $vgpr39_lo16, implicit-def $vgpr39_hi16, implicit-def $vgpr40, implicit-def $vgpr40_lo16, implicit-def $vgpr40_hi16, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7, implicit-def $vgpr8, implicit-def $vgpr9, implicit-def $vgpr10, implicit-def $vgpr11, implicit-def $vgpr12, implicit-def $vgpr13, implicit-def $vgpr14, implicit-def $vgpr15, implicit-def $vgpr16, implicit-def $vgpr17, implicit-def $vgpr18, implicit-def $vgpr19, implicit-def $vgpr20, implicit-def $vgpr21, implicit-def $vgpr22, implicit-def $vgpr23, implicit-def $vgpr24, implicit-def $vgpr25, implicit-def $vgpr26, implicit-def $vgpr27, implicit-def $vgpr28, implicit-def $vgpr29, implicit-def $vgpr30, implicit-def $vgpr31, implicit-def $vgpr32, implicit-def $vgpr33, implicit-def $vgpr34, implicit-def $vgpr35, implicit-def $vgpr36, implicit-def $vgpr37, implicit-def $vgpr38, implicit-def $vgpr39, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr42, implicit-def $vgpr43, implicit-def $vgpr44, implicit-def $vgpr45, implicit-def $vgpr46, implicit-def $vgpr47, implicit-def $vgpr48, implicit-def $vgpr49, implicit-def $vgpr50, implicit-def $vgpr51, implicit-def $vgpr52, implicit-def $vgpr53, implicit-def $vgpr54, implicit-def $vgpr55, implicit-def $vgpr56, implicit-def $vgpr57, implicit-def $vgpr58, implicit-def $vgpr59, implicit-def $vgpr60, implicit-def $vgpr61, implicit-def $vgpr62, implicit-def $vgpr63, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; CHECK-NEXT: S_CLAUSE 62
; CHECK-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec
; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec
@@ -189,7 +189,7 @@ body: |
; CHECK-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 248, 0, 0, implicit $exec
; CHECK-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 252, 0, 0, implicit $exec
; CHECK-NEXT: }
- ; CHECK-NEXT: BUNDLE implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr64, implicit-def $vgpr65, implicit-def $vgpr66, implicit-def $vgpr67, implicit-def $vgpr68, implicit-def $vgpr69, implicit-def $vgpr70, implicit-def $vgpr71, implicit-def $vgpr72, implicit-def $vgpr73, implicit-def $vgpr74, implicit-def $vgpr75, implicit-def $vgpr76, implicit-def $vgpr77, implicit-def $vgpr78, implicit-def $vgpr79, implicit-def $vgpr80, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; CHECK-NEXT: S_CLAUSE 16
; CHECK-NEXT: $vgpr64 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec
; CHECK-NEXT: $vgpr65 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, implicit $exec
@@ -213,7 +213,7 @@ body: |
; GFX11-LABEL: name: long_clause
; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit-def $vgpr6, implicit-def $vgpr6_lo16, implicit-def $vgpr6_hi16, implicit-def $vgpr7, implicit-def $vgpr7_lo16, implicit-def $vgpr7_hi16, implicit-def $vgpr8, implicit-def $vgpr8_lo16, implicit-def $vgpr8_hi16, implicit-def $vgpr9, implicit-def $vgpr9_lo16, implicit-def $vgpr9_hi16, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit-def $vgpr17, implicit-def $vgpr17_lo16, implicit-def $vgpr17_hi16, implicit-def $vgpr18, implicit-def $vgpr18_lo16, implicit-def $vgpr18_hi16, implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr24, implicit-def $vgpr24_lo16, implicit-def $vgpr24_hi16, implicit-def $vgpr25, implicit-def $vgpr25_lo16, implicit-def $vgpr25_hi16, implicit-def $vgpr26, implicit-def $vgpr26_lo16, implicit-def $vgpr26_hi16, implicit-def $vgpr27, implicit-def $vgpr27_lo16, implicit-def $vgpr27_hi16, implicit-def $vgpr28, implicit-def $vgpr28_lo16, implicit-def $vgpr28_hi16, implicit-def $vgpr29, implicit-def $vgpr29_lo16, implicit-def $vgpr29_hi16, implicit-def $vgpr30, implicit-def $vgpr30_lo16, implicit-def $vgpr30_hi16, implicit-def $vgpr31, implicit-def $vgpr31_lo16, implicit-def $vgpr31_hi16, implicit-def $vgpr32, implicit-def $vgpr32_lo16, implicit-def $vgpr32_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7, implicit-def $vgpr8, implicit-def $vgpr9, implicit-def $vgpr10, implicit-def $vgpr11, implicit-def $vgpr12, implicit-def $vgpr13, implicit-def $vgpr14, implicit-def $vgpr15, implicit-def $vgpr16, implicit-def $vgpr17, implicit-def $vgpr18, implicit-def $vgpr19, implicit-def $vgpr20, implicit-def $vgpr21, implicit-def $vgpr22, implicit-def $vgpr23, implicit-def $vgpr24, implicit-def $vgpr25, implicit-def $vgpr26, implicit-def $vgpr27, implicit-def $vgpr28, implicit-def $vgpr29, implicit-def $vgpr30, implicit-def $vgpr31, implicit-def $vgpr32, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-NEXT: S_CLAUSE 31
; GFX11-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec
@@ -248,7 +248,7 @@ body: |
; GFX11-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr32 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec
; GFX11-NEXT: }
- ; GFX11-NEXT: BUNDLE implicit-def $vgpr33, implicit-def $vgpr33_lo16, implicit-def $vgpr33_hi16, implicit-def $vgpr34, implicit-def $vgpr34_lo16, implicit-def $vgpr34_hi16, implicit-def $vgpr35, implicit-def $vgpr35_lo16, implicit-def $vgpr35_hi16, implicit-def $vgpr36, implicit-def $vgpr36_lo16, implicit-def $vgpr36_hi16, implicit-def $vgpr37, implicit-def $vgpr37_lo16, implicit-def $vgpr37_hi16, implicit-def $vgpr38, implicit-def $vgpr38_lo16, implicit-def $vgpr38_hi16, implicit-def $vgpr39, implicit-def $vgpr39_lo16, implicit-def $vgpr39_hi16, implicit-def $vgpr40, implicit-def $vgpr40_lo16, implicit-def $vgpr40_hi16, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-NEXT: BUNDLE implicit-def $vgpr33, implicit-def $vgpr34, implicit-def $vgpr35, implicit-def $vgpr36, implicit-def $vgpr37, implicit-def $vgpr38, implicit-def $vgpr39, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr42, implicit-def $vgpr43, implicit-def $vgpr44, implicit-def $vgpr45, implicit-def $vgpr46, implicit-def $vgpr47, implicit-def $vgpr48, implicit-def $vgpr49, implicit-def $vgpr50, implicit-def $vgpr51, implicit-def $vgpr52, implicit-def $vgpr53, implicit-def $vgpr54, implicit-def $vgpr55, implicit-def $vgpr56, implicit-def $vgpr57, implicit-def $vgpr58, implicit-def $vgpr59, implicit-def $vgpr60, implicit-def $vgpr61, implicit-def $vgpr62, implicit-def $vgpr63, implicit-def $vgpr64, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-NEXT: S_CLAUSE 31
; GFX11-NEXT: $vgpr33 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr34 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, implicit $exec
@@ -283,7 +283,7 @@ body: |
; GFX11-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 252, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr64 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec
; GFX11-NEXT: }
- ; GFX11-NEXT: BUNDLE implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX11-NEXT: BUNDLE implicit-def $vgpr65, implicit-def $vgpr66, implicit-def $vgpr67, implicit-def $vgpr68, implicit-def $vgpr69, implicit-def $vgpr70, implicit-def $vgpr71, implicit-def $vgpr72, implicit-def $vgpr73, implicit-def $vgpr74, implicit-def $vgpr75, implicit-def $vgpr76, implicit-def $vgpr77, implicit-def $vgpr78, implicit-def $vgpr79, implicit-def $vgpr80, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX11-NEXT: S_CLAUSE 15
; GFX11-NEXT: $vgpr65 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, implicit $exec
; GFX11-NEXT: $vgpr66 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 264, 0, 0, implicit $exec
@@ -306,7 +306,7 @@ body: |
; GFX12-LABEL: name: long_clause
; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit-def $vgpr6, implicit-def $vgpr6_lo16, implicit-def $vgpr6_hi16, implicit-def $vgpr7, implicit-def $vgpr7_lo16, implicit-def $vgpr7_hi16, implicit-def $vgpr8, implicit-def $vgpr8_lo16, implicit-def $vgpr8_hi16, implicit-def $vgpr9, implicit-def $vgpr9_lo16, implicit-def $vgpr9_hi16, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit-def $vgpr17, implicit-def $vgpr17_lo16, implicit-def $vgpr17_hi16, implicit-def $vgpr18, implicit-def $vgpr18_lo16, implicit-def $vgpr18_hi16, implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr24, implicit-def $vgpr24_lo16, implicit-def $vgpr24_hi16, implicit-def $vgpr25, implicit-def $vgpr25_lo16, implicit-def $vgpr25_hi16, implicit-def $vgpr26, implicit-def $vgpr26_lo16, implicit-def $vgpr26_hi16, implicit-def $vgpr27, implicit-def $vgpr27_lo16, implicit-def $vgpr27_hi16, implicit-def $vgpr28, implicit-def $vgpr28_lo16, implicit-def $vgpr28_hi16, implicit-def $vgpr29, implicit-def $vgpr29_lo16, implicit-def $vgpr29_hi16, implicit-def $vgpr30, implicit-def $vgpr30_lo16, implicit-def $vgpr30_hi16, implicit-def $vgpr31, implicit-def $vgpr31_lo16, implicit-def $vgpr31_hi16, implicit-def $vgpr32, implicit-def $vgpr32_lo16, implicit-def $vgpr32_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX12-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7, implicit-def $vgpr8, implicit-def $vgpr9, implicit-def $vgpr10, implicit-def $vgpr11, implicit-def $vgpr12, implicit-def $vgpr13, implicit-def $vgpr14, implicit-def $vgpr15, implicit-def $vgpr16, implicit-def $vgpr17, implicit-def $vgpr18, implicit-def $vgpr19, implicit-def $vgpr20, implicit-def $vgpr21, implicit-def $vgpr22, implicit-def $vgpr23, implicit-def $vgpr24, implicit-def $vgpr25, implicit-def $vgpr26, implicit-def $vgpr27, implicit-def $vgpr28, implicit-def $vgpr29, implicit-def $vgpr30, implicit-def $vgpr31, implicit-def $vgpr32, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX12-NEXT: S_CLAUSE 31
; GFX12-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec
; GFX12-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec
@@ -341,7 +341,7 @@ body: |
; GFX12-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, implicit $exec
; GFX12-NEXT: $vgpr32 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, implicit $exec
; GFX12-NEXT: }
- ; GFX12-NEXT: BUNDLE implicit-def $vgpr33, implicit-def $vgpr33_lo16, implicit-def $vgpr33_hi16, implicit-def $vgpr34, implicit-def $vgpr34_lo16, implicit-def $vgpr34_hi16, implicit-def $vgpr35, implicit-def $vgpr35_lo16, implicit-def $vgpr35_hi16, implicit-def $vgpr36, implicit-def $vgpr36_lo16, implicit-def $vgpr36_hi16, implicit-def $vgpr37, implicit-def $vgpr37_lo16, implicit-def $vgpr37_hi16, implicit-def $vgpr38, implicit-def $vgpr38_lo16, implicit-def $vgpr38_hi16, implicit-def $vgpr39, implicit-def $vgpr39_lo16, implicit-def $vgpr39_hi16, implicit-def $vgpr40, implicit-def $vgpr40_lo16, implicit-def $vgpr40_hi16, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX12-NEXT: BUNDLE implicit-def $vgpr33, implicit-def $vgpr34, implicit-def $vgpr35, implicit-def $vgpr36, implicit-def $vgpr37, implicit-def $vgpr38, implicit-def $vgpr39, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr42, implicit-def $vgpr43, implicit-def $vgpr44, implicit-def $vgpr45, implicit-def $vgpr46, implicit-def $vgpr47, implicit-def $vgpr48, implicit-def $vgpr49, implicit-def $vgpr50, implicit-def $vgpr51, implicit-def $vgpr52, implicit-def $vgpr53, implicit-def $vgpr54, implicit-def $vgpr55, implicit-def $vgpr56, implicit-def $vgpr57, implicit-def $vgpr58, implicit-def $vgpr59, implicit-def $vgpr60, implicit-def $vgpr61, implicit-def $vgpr62, implicit-def $vgpr63, implicit-def $vgpr64, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX12-NEXT: S_CLAUSE 31
; GFX12-NEXT: $vgpr33 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, implicit $exec
; GFX12-NEXT: $vgpr34 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, implicit $exec
@@ -376,7 +376,7 @@ body: |
; GFX12-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 252, 0, 0, implicit $exec
; GFX12-NEXT: $vgpr64 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec
; GFX12-NEXT: }
- ; GFX12-NEXT: BUNDLE implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
+ ; GFX12-NEXT: BUNDLE implicit-def $vgpr65, implicit-def $vgpr66, implicit-def $vgpr67, implicit-def $vgpr68, implicit-def $vgpr69, implicit-def $vgpr70, implicit-def $vgpr71, implicit-def $vgpr72, implicit-def $vgpr73, implicit-def $vgpr74, implicit-def $vgpr75, implicit-def $vgpr76, implicit-def $vgpr77, implicit-def $vgpr78, implicit-def $vgpr79, implicit-def $vgpr80, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec {
; GFX12-NEXT: S_CLAUSE 15
; GFX12-NEXT: $vgpr65 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, implicit $exec
; GFX12-NEXT: $vgpr66 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 264, 0, 0, implicit $exec
@@ -486,7 +486,7 @@ body: |
; CHECK-LABEL: name: kill
; CHECK: liveins: $sgpr0_sgpr1, $sgpr4
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
+ ; CHECK-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr3, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
; CHECK-NEXT: S_CLAUSE 1
; CHECK-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; CHECK-NEXT: KILL undef renamable $sgpr4
@@ -496,7 +496,7 @@ body: |
; GFX11-LABEL: name: kill
; GFX11: liveins: $sgpr0_sgpr1, $sgpr4
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
+ ; GFX11-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr3, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
; GFX11-NEXT: S_CLAUSE 1
; GFX11-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GFX11-NEXT: KILL undef renamable $sgpr4
@@ -506,7 +506,7 @@ body: |
; GFX12-LABEL: name: kill
; GFX12: liveins: $sgpr0_sgpr1, $sgpr4
; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
+ ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr3, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
; GFX12-NEXT: S_CLAUSE 1
; GFX12-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GFX12-NEXT: KILL undef renamable $sgpr4
@@ -526,7 +526,7 @@ body: |
; CHECK-LABEL: name: kill2
; CHECK: liveins: $sgpr0_sgpr1, $sgpr4, $sgpr5
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
+ ; CHECK-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr3, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
; CHECK-NEXT: S_CLAUSE 1
; CHECK-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; CHECK-NEXT: KILL undef renamable $sgpr4
@@ -537,7 +537,7 @@ body: |
; GFX11-LABEL: name: kill2
; GFX11: liveins: $sgpr0_sgpr1, $sgpr4, $sgpr5
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
+ ; GFX11-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr3, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
; GFX11-NEXT: S_CLAUSE 1
; GFX11-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GFX11-NEXT: KILL undef renamable $sgpr4
@@ -548,7 +548,7 @@ body: |
; GFX12-LABEL: name: kill2
; GFX12: liveins: $sgpr0_sgpr1, $sgpr4, $sgpr5
; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
+ ; GFX12-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr3, implicit $sgpr0_sgpr1, implicit undef $sgpr4 {
; GFX12-NEXT: S_CLAUSE 1
; GFX12-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
; GFX12-NEXT: KILL undef renamable $sgpr4
@@ -570,7 +570,7 @@ body: |
; CHECK-LABEL: name: flat_load_atomic
; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit $vgpr0_vgpr1, implicit $exec, implicit $flat_scr, implicit $vgpr2 {
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr3, implicit-def $vgpr4, implicit $vgpr0_vgpr1, implicit $exec, implicit $flat_scr, implicit $vgpr2 {
; CHECK-NEXT: S_CLAUSE 1
; CHECK-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; CHECK-NEXT: $vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec, implicit $flat_scr
@@ -600,7 +600,7 @@ body: |
; CHECK-LABEL: name: global_load_atomic
; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit $vgpr0_vgpr1, implicit $exec, implicit $vgpr2 {
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr3, implicit-def $vgpr4, implicit $vgpr0_vgpr1, implicit $exec, implicit $vgpr2 {
; CHECK-NEXT: S_CLAUSE 1
; CHECK-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
; CHECK-NEXT: $vgpr4 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 4, 0, implicit $exec
@@ -657,7 +657,7 @@ body: |
; CHECK-LABEL: name: buffer_load_atomic
; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $exec, implicit $vgpr0 {
+ ; CHECK-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $exec, implicit $vgpr0 {
; CHECK-NEXT: S_CLAUSE 1
; CHECK-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
; CHECK-NEXT: $vgpr0 = BUFFER_ATOMIC_ADD_OFFSET_RTN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 4, 0, 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-getreg-waitalu.mir b/llvm/test/CodeGen/AMDGPU/hazard-getreg-waitalu.mir
new file mode 100644
index 0000000..213fba9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hazard-getreg-waitalu.mir
@@ -0,0 +1,91 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass post-RA-hazard-rec -o - %s | FileCheck --check-prefix=GCN %s
+
+---
+name: s_getreg_mode
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_getreg_mode
+ ; GCN: $sgpr0 = S_GETREG_B32 1, implicit $mode
+ $sgpr0 = S_GETREG_B32 1, implicit $mode
+...
+
+---
+name: s_getreg_status
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_getreg_status
+ ; GCN: S_WAITCNT_DEPCTR 0
+ ; GCN-NEXT: $sgpr0 = S_GETREG_B32 2, implicit $mode
+ $sgpr0 = S_GETREG_B32 2, implicit $mode
+...
+
+---
+name: s_getreg_status_masked
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_getreg_status_masked
+ ; GCN: S_WAITCNT_DEPCTR 0
+ ; GCN-NEXT: $sgpr0 = S_GETREG_B32 66, implicit $mode
+ $sgpr0 = S_GETREG_B32 66, implicit $mode
+...
+
+---
+name: s_getreg_state_priv
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_getreg_state_priv
+ ; GCN: S_WAITCNT_DEPCTR 0
+ ; GCN-NEXT: $sgpr0 = S_GETREG_B32 4, implicit $mode
+ $sgpr0 = S_GETREG_B32 4, implicit $mode
+...
+
+---
+name: s_getreg_excp_flag_priv
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_getreg_excp_flag_priv
+ ; GCN: S_WAITCNT_DEPCTR 0
+ ; GCN-NEXT: $sgpr0 = S_GETREG_B32 17, implicit $mode
+ $sgpr0 = S_GETREG_B32 17, implicit $mode
+...
+
+---
+name: s_getreg_excp_flag_user
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_getreg_excp_flag_user
+ ; GCN: S_WAITCNT_DEPCTR 0
+ ; GCN-NEXT: $sgpr0 = S_GETREG_B32 18, implicit $mode
+ $sgpr0 = S_GETREG_B32 18, implicit $mode
+...
+
+---
+name: s_getreg_status_in_bundle
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_getreg_status_in_bundle
+ ; GCN: BUNDLE {
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 0
+ ; GCN-NEXT: $sgpr0 = S_GETREG_B32 2, implicit $mode
+ ; GCN-NEXT: }
+ BUNDLE {
+ S_NOP 0
+ $sgpr0 = S_GETREG_B32 2, implicit $mode
+ }
+...
+
+---
+name: s_getreg_status_top_of_bundle
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_getreg_status_top_of_bundle
+ ; GCN: BUNDLE {
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 0
+ ; GCN-NEXT: $sgpr0 = S_GETREG_B32 2, implicit $mode
+ ; GCN-NEXT: }
+ BUNDLE {
+ $sgpr0 = S_GETREG_B32 2, implicit $mode
+ }
+...
diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx1250.mir
new file mode 100644
index 0000000..1704785
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx1250.mir
@@ -0,0 +1,549 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: ds_atomic_async_barrier_arrive_b64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; GCN-LABEL: name: ds_atomic_async_barrier_arrive_b64
+ ; GCN: liveins: $vgpr0, $vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65507
+ ; GCN-NEXT: DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 $vgpr1, 0, 0, implicit-def $asynccnt, implicit $asynccnt, implicit $exec
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65507
+ DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 $vgpr1, 0, 0, implicit-def $asynccnt, implicit $asynccnt, implicit $exec
+...
+
+---
+name: write_s102_read_flat_scr_base_lo
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+ $sgpr102 = S_MOV_B32 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+...
+
+---
+name: write_s103_read_flat_scr_base_hi
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s103_read_flat_scr_base_hi
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr103 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec
+ $sgpr103 = S_MOV_B32 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec
+...
+
+---
+name: write_s102_read_flat_scr_base
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: write_s102_read_flat_scr_base
+ ; GCN: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $src_flat_scratch_base
+ $sgpr102 = S_MOV_B32 0
+ $sgpr0_sgpr1 = S_MOV_B64 $src_flat_scratch_base
+...
+
+---
+name: write_s103_read_flat_scr_base
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: write_s103_read_flat_scr_base
+ ; GCN: $sgpr103 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $src_flat_scratch_base
+ $sgpr103 = S_MOV_B32 0
+ $sgpr0_sgpr1 = S_MOV_B64 $src_flat_scratch_base
+...
+
+---
+name: write_s102_s103_read_flat_scr_base
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: write_s102_s103_read_flat_scr_base
+ ; GCN: $sgpr102_sgpr103 = S_MOV_B64 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $src_flat_scratch_base
+ $sgpr102_sgpr103 = S_MOV_B64 0
+ $sgpr0_sgpr1 = S_MOV_B64 $src_flat_scratch_base
+...
+
+---
+name: write_s102_getreg_flat_scr_base_lo
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s102_getreg_flat_scr_base_lo
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $sgpr1 = S_GETREG_B32 20, implicit $mode
+ $sgpr102 = S_MOV_B32 0
+ $sgpr1 = S_GETREG_B32 20, implicit $mode
+...
+
+---
+name: write_s103_getreg_flat_scr_base_hi
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s103_getreg_flat_scr_base_hi
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr103 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $sgpr1 = S_GETREG_B32 21, implicit $mode
+ $sgpr103 = S_MOV_B32 0
+ $sgpr1 = S_GETREG_B32 21, implicit $mode
+...
+
+---
+name: write_s102_s103_getreg_flat_scr_base_hi
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: write_s102_s103_getreg_flat_scr_base_hi
+ ; GCN: $sgpr102_sgpr103 = S_MOV_B64 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $sgpr1 = S_GETREG_B32 21, implicit $mode
+ $sgpr102_sgpr103 = S_MOV_B64 0
+ $sgpr1 = S_GETREG_B32 21, implicit $mode
+...
+
+---
+name: write_s102_read_flat_scr_base_lo_9_salu_valu
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_9_salu_valu
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr2_sgpr3 = S_MOV_B64 0
+ ; GCN-NEXT: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ ; GCN-NEXT: $sgpr4 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr5 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr6 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr7 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+ $sgpr102 = S_MOV_B32 0
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 0
+ $sgpr2_sgpr3 = S_MOV_B64 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec
+ $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+ ; NOP does not count because it does not write SGPRs
+ S_NOP 0
+ ; DS_READ_B32 does not count because it is not SALU or VALU
+ $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ $sgpr4 = S_MOV_B32 0
+ $sgpr5 = S_MOV_B32 0
+ $sgpr6 = S_MOV_B32 0
+ $sgpr7 = S_MOV_B32 0
+ ; S_LOAD_DWORDX2_IMM does not count because it is not SALU
+ $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+...
+
+---
+name: write_s102_read_flat_scr_base_lo_10_salu_valu_expired
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_10_salu_valu_expired
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr2_sgpr3 = S_MOV_B64 0
+ ; GCN-NEXT: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ ; GCN-NEXT: $sgpr4 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr5 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr6 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr7 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0
+ ; GCN-NEXT: $sgpr10 = S_MOV_B32 0
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+ $sgpr102 = S_MOV_B32 0
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 0
+ $sgpr2_sgpr3 = S_MOV_B64 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec
+ $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+ ; NOP does not count because it does not write SGPRs
+ S_NOP 0
+ ; DS_READ_B32 does not count because it is not SALU or VALU
+ $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ $sgpr4 = S_MOV_B32 0
+ $sgpr5 = S_MOV_B32 0
+ $sgpr6 = S_MOV_B32 0
+ $sgpr7 = S_MOV_B32 0
+ ; S_LOAD_DWORDX2_IMM does not count because it is not SALU
+ $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0
+ $sgpr10 = S_MOV_B32 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+...
+
+---
+name: write_s103_read_flat_scr_base_hi_9_salu_valu
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s103_read_flat_scr_base_hi_9_salu_valu
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr103 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr2_sgpr3 = S_MOV_B64 0
+ ; GCN-NEXT: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ ; GCN-NEXT: $sgpr4 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr5 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr6 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr7 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec
+ $sgpr103 = S_MOV_B32 0
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 0
+ $sgpr2_sgpr3 = S_MOV_B64 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec
+ $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+ ; NOP does not count because it does not write SGPRs
+ S_NOP 0
+ ; DS_READ_B32 does not count because it is not SALU or VALU
+ $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ $sgpr4 = S_MOV_B32 0
+ $sgpr5 = S_MOV_B32 0
+ $sgpr6 = S_MOV_B32 0
+ $sgpr7 = S_MOV_B32 0
+ ; S_LOAD_DWORDX2_IMM does not count because it is not SALU
+ $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec
+...
+
+---
+name: write_s103_read_flat_scr_base_hi_10_salu_valu_expired
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s103_read_flat_scr_base_hi_10_salu_valu_expired
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr103 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr2_sgpr3 = S_MOV_B64 0
+ ; GCN-NEXT: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ ; GCN-NEXT: $sgpr4 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr5 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr6 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr7 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0
+ ; GCN-NEXT: $sgpr10 = S_MOV_B32 0
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec
+ $sgpr103 = S_MOV_B32 0
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 0
+ $sgpr2_sgpr3 = S_MOV_B64 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec
+ $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+ ; NOP does not count because it does not write SGPRs
+ S_NOP 0
+ ; DS_READ_B32 does not count because it is not SALU or VALU
+ $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
+ $sgpr4 = S_MOV_B32 0
+ $sgpr5 = S_MOV_B32 0
+ $sgpr6 = S_MOV_B32 0
+ $sgpr7 = S_MOV_B32 0
+ ; S_LOAD_DWORDX2_IMM does not count because it is not SALU
+ $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0
+ $sgpr10 = S_MOV_B32 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec
+...
+
+---
+name: write_s102_read_flat_scr_base_hi_no_hazard
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s102_read_flat_scr_base_hi_no_hazard
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec
+ $sgpr102 = S_MOV_B32 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec
+...
+
+---
+name: write_s102_read_flat_scr_base_lo_expired_by_wait0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_expired_by_wait0
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 0
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+ $sgpr102 = S_MOV_B32 0
+ S_WAITCNT_DEPCTR 0
+ S_NOP 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+...
+
+---
+name: write_s102_read_flat_scr_base_lo_expired_by_wait_vs_sdst_sa_sdst
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_expired_by_wait_vs_sdst_sa_sdst
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+ $sgpr102 = S_MOV_B32 0
+ S_WAITCNT_DEPCTR 61950
+ S_NOP 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+...
+
+---
+name: write_s102_read_flat_scr_base_lo_not_expired_by_wait_va_sdst_only
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_not_expired_by_wait_va_sdst_only
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61951
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+ $sgpr102 = S_MOV_B32 0
+ S_WAITCNT_DEPCTR 61951
+ S_NOP 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+...
+
+---
+name: write_s102_read_flat_scr_base_lo_not_expired_by_wait_sa_sdst_only
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_not_expired_by_wait_sa_sdst_only
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: S_NOP 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+ $sgpr102 = S_MOV_B32 0
+ S_WAITCNT_DEPCTR 65534
+ S_NOP 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+...
+
+---
+name: write_s102_write_s103_read_flat_scr_base_lo_read_flat_scr_base_hi
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; GCN-LABEL: name: write_s102_write_s103_read_flat_scr_base_lo_read_flat_scr_base_hi
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr103 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+ ; GCN-NEXT: $vgpr1 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec
+ $sgpr102 = S_MOV_B32 0
+ $sgpr103 = S_MOV_B32 0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec
+...
+
+---
+name: write_s102_read_flat_scr_base_lo_cross_blocks
+tracksRegLiveness: true
+body: |
+ ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_cross_blocks
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GCN-NEXT: liveins: $vgpr0, $sgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr3 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr4 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr5 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr6 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr7 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr8 = S_MOV_B32 0
+ ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN-NEXT: S_BRANCH %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr102 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr3 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr4 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr5 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr6 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr7 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr8 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr9 = S_MOV_B32 0
+ ; GCN-NEXT: S_BRANCH %bb.2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+ bb.0:
+ liveins: $vgpr0, $sgpr0
+ $sgpr102 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ $sgpr1 = S_MOV_B32 0
+ $sgpr2 = S_MOV_B32 0
+ $sgpr3 = S_MOV_B32 0
+ $sgpr4 = S_MOV_B32 0
+ $sgpr5 = S_MOV_B32 0
+ $sgpr6 = S_MOV_B32 0
+ $sgpr7 = S_MOV_B32 0
+ $sgpr8 = S_MOV_B32 0
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ liveins: $vgpr0
+ $sgpr102 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 0
+ $sgpr2 = S_MOV_B32 0
+ $sgpr3 = S_MOV_B32 0
+ $sgpr4 = S_MOV_B32 0
+ $sgpr5 = S_MOV_B32 0
+ $sgpr6 = S_MOV_B32 0
+ $sgpr7 = S_MOV_B32 0
+ $sgpr8 = S_MOV_B32 0
+ $sgpr9 = S_MOV_B32 0
+ S_BRANCH %bb.2
+
+ bb.2:
+ liveins: $vgpr0
+ $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec
+...
+
+---
+name: s_setreg_b32_hwreg_mode
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+ ; GCN-LABEL: name: s_setreg_b32_hwreg_mode
+ ; GCN: liveins: $sgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: S_SETREG_B32 $sgpr0, 1, implicit-def $mode, implicit $mode
+ S_SETREG_B32 $sgpr0, 1, implicit-def $mode, implicit $mode
+...
+
+---
+name: s_setreg_b32_mode
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+ ; GCN-LABEL: name: s_setreg_b32_mode
+ ; GCN: liveins: $sgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: S_SETREG_B32_mode $sgpr0, 1, implicit-def $mode, implicit $mode
+ S_SETREG_B32_mode $sgpr0, 1, implicit-def $mode, implicit $mode
+...
+
+---
+name: s_setreg_imm32_b32_hwreg_mode
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_setreg_imm32_b32_hwreg_mode
+ ; GCN: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: S_SETREG_IMM32_B32 1, 1, implicit-def $mode, implicit $mode
+ S_SETREG_IMM32_B32 1, 1, implicit-def $mode, implicit $mode
+...
+
+---
+name: s_setreg_imm32_b32_mode
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: s_setreg_imm32_b32_mode
+ ; GCN: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: V_NOP_e32 implicit $exec
+ ; GCN-NEXT: S_SETREG_IMM32_B32_mode 1, 1, implicit-def $mode, implicit $mode
+ S_SETREG_IMM32_B32_mode 1, 1, implicit-def $mode, implicit $mode
+...
diff --git a/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll b/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll
index b68d74b..704b325 100644
--- a/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll
@@ -48,7 +48,7 @@ define i32 @global_agent_release_idempotent_or(ptr addrspace(1) %in) {
; GFX942-NEXT: s_setpc_b64 s[30:31]
; OPT-LABEL: @global_agent_release_idempotent_or(
; OPT-NEXT: entry:
-; OPT-NEXT: [[VAL:%.*]] = atomicrmw or ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") release, align 4
+; OPT-NEXT: [[VAL:%.*]] = atomicrmw add ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") release, align 4
; OPT-NEXT: ret i32 [[VAL]]
;
entry:
@@ -56,6 +56,42 @@ entry:
ret i32 %val
}
+define i32 @global_agent_release_idempotent_or_no_remote(ptr addrspace(1) %in) {
+; GFX942-LABEL: global_agent_release_idempotent_or_no_remote:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: buffer_wbl2 sc1
+; GFX942-NEXT: global_atomic_or v0, v[0:1], v2, off sc0
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+; OPT-LABEL: @global_agent_release_idempotent_or_no_remote(
+; OPT-NEXT: entry:
+; OPT-NEXT: [[VAL:%.*]] = atomicrmw or ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") release, align 4, !amdgpu.no.remote.memory [[META0:![0-9]+]]
+; OPT-NEXT: ret i32 [[VAL]]
+entry:
+ %val = atomicrmw or ptr addrspace(1) %in, i32 0 syncscope("agent-one-as") release, align 4, !amdgpu.no.remote.memory !0
+ ret i32 %val
+}
+
+define i32 @global_agent_release_idempotent_or_no_fine_grained(ptr addrspace(1) %in) {
+; GFX942-LABEL: global_agent_release_idempotent_or_no_fine_grained:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: buffer_wbl2 sc1
+; GFX942-NEXT: global_atomic_or v0, v[0:1], v2, off sc0
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+; OPT-LABEL: @global_agent_release_idempotent_or_no_fine_grained(
+; OPT-NEXT: entry:
+; OPT-NEXT: [[VAL:%.*]] = atomicrmw or ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") release, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; OPT-NEXT: ret i32 [[VAL]]
+entry:
+ %val = atomicrmw or ptr addrspace(1) %in, i32 0 syncscope("agent-one-as") release, align 4, !amdgpu.no.fine.grained.memory !0
+ ret i32 %val
+}
+
define i32 @global_agent_acquire_release_idempotent_or(ptr addrspace(1) %in) {
; GFX942-LABEL: global_agent_acquire_release_idempotent_or:
; GFX942: ; %bb.0: ; %entry
@@ -68,7 +104,7 @@ define i32 @global_agent_acquire_release_idempotent_or(ptr addrspace(1) %in) {
; GFX942-NEXT: s_setpc_b64 s[30:31]
; OPT-LABEL: @global_agent_acquire_release_idempotent_or(
; OPT-NEXT: entry:
-; OPT-NEXT: [[VAL:%.*]] = atomicrmw or ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") acq_rel, align 4
+; OPT-NEXT: [[VAL:%.*]] = atomicrmw add ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") acq_rel, align 4
; OPT-NEXT: ret i32 [[VAL]]
;
entry:
@@ -88,9 +124,8 @@ define i32 @global_agent_acquire_release_idempotent_or__no_fine_grained(ptr addr
; GFX942-NEXT: s_setpc_b64 s[30:31]
; OPT-LABEL: @global_agent_acquire_release_idempotent_or__no_fine_grained(
; OPT-NEXT: entry:
-; OPT-NEXT: [[VAL:%.*]] = atomicrmw or ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") acq_rel, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]]
+; OPT-NEXT: [[VAL:%.*]] = atomicrmw or ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") acq_rel, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; OPT-NEXT: ret i32 [[VAL]]
-;
entry:
%val = atomicrmw or ptr addrspace(1) %in, i32 0 syncscope("agent-one-as") acq_rel, align 4, !amdgpu.no.fine.grained.memory !0
ret i32 %val
@@ -108,7 +143,7 @@ define i32 @global_agent_seq_cst_idempotent_or(ptr addrspace(1) %in) {
; GFX942-NEXT: s_setpc_b64 s[30:31]
; OPT-LABEL: @global_agent_seq_cst_idempotent_or(
; OPT-NEXT: entry:
-; OPT-NEXT: [[VAL:%.*]] = atomicrmw or ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") seq_cst, align 4
+; OPT-NEXT: [[VAL:%.*]] = atomicrmw add ptr addrspace(1) [[IN:%.*]], i32 0 syncscope("agent-one-as") seq_cst, align 4
; OPT-NEXT: ret i32 [[VAL]]
;
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 7ebd692..305461e 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -1693,12 +1693,11 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-DL-TRUE16-NEXT: v_perm_b32 v1, v5, v5, 0xc0c0302
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v3.l, v0.l
; GFX11-DL-TRUE16-NEXT: v_perm_b32 v2, v4, v4, 0xc0c0302
-; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_dot4_u32_u8 v0, v2, v1, v0
; GFX11-DL-TRUE16-NEXT: global_store_b16 v6, v0, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_endpgm
@@ -2724,32 +2723,32 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[2:3]
; GFX11-DL-TRUE16-NEXT: global_load_d16_u8 v0, v5, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v0.h, 8, v3.l
-; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.l, 8, v4.l
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 24, v3
+; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v4
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v0.h, 8, v3.l
+; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v3.h, v4.h
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.h, 8, v4.l
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l
-; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v1.l
-; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v3.h, v4.h
-; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v2.l, v6.l
+; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v2.l, v2.l, v6.l
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
+; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
+; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v1.h
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.h
-; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.h
-; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-DL-TRUE16-NEXT: v_or_b16 v6.h, v0.h, v1.l
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-DL-TRUE16-NEXT: v_or_b32_e32 v2, v2, v6
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v2
-; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l
+; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v2.l
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.h, v6.l
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v0.h
+; GFX11-DL-TRUE16-NEXT: v_or_b16 v6.h, v1.l, v2.l
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT: v_or_b32_e32 v1, v7, v6
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.h, v4.h, v0.l
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX11-DL-TRUE16-NEXT: global_store_b8 v5, v0, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll
index 58cfd40..2139000 100644
--- a/llvm/test/CodeGen/AMDGPU/imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx942 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
; Use a 64-bit value with lo bits that can be represented as an inline constant
define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) {
@@ -25,6 +26,17 @@ define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: i64_imm_inline_lo:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 5
+; GFX942-NEXT: v_mov_b32_e32 v1, 0x12345678
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
entry:
store i64 1311768464867721221, ptr addrspace(1) %out ; 0x1234567800000005
ret void
@@ -53,6 +65,17 @@ define amdgpu_kernel void @i64_imm_inline_hi(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: i64_imm_inline_hi:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x12345678
+; GFX942-NEXT: v_mov_b32_e32 v1, 5
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
entry:
store i64 21780256376, ptr addrspace(1) %out ; 0x0000000512345678
ret void
@@ -80,6 +103,17 @@ define amdgpu_kernel void @store_imm_neg_0.0_i64(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_imm_neg_0.0_i64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store i64 -9223372036854775808, ptr addrspace(1) %out
ret void
}
@@ -104,6 +138,16 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_neg_0.0_i32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_bfrev_b32_e32 v0, 1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store i32 -2147483648, ptr addrspace(1) %out
ret void
}
@@ -128,6 +172,16 @@ define amdgpu_kernel void @store_inline_imm_0.0_f32(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_0.0_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store float 0.0, ptr addrspace(1) %out
ret void
}
@@ -152,6 +206,16 @@ define amdgpu_kernel void @store_imm_neg_0.0_f32(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_imm_neg_0.0_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_bfrev_b32_e32 v0, 1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store float -0.0, ptr addrspace(1) %out
ret void
}
@@ -176,6 +240,16 @@ define amdgpu_kernel void @store_inline_imm_0.5_f32(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_0.5_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0.5
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store float 0.5, ptr addrspace(1) %out
ret void
}
@@ -200,6 +274,16 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f32(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_m_0.5_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, -0.5
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store float -0.5, ptr addrspace(1) %out
ret void
}
@@ -224,6 +308,16 @@ define amdgpu_kernel void @store_inline_imm_1.0_f32(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_1.0_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store float 1.0, ptr addrspace(1) %out
ret void
}
@@ -248,6 +342,16 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f32(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_m_1.0_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, -1.0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store float -1.0, ptr addrspace(1) %out
ret void
}
@@ -272,6 +376,16 @@ define amdgpu_kernel void @store_inline_imm_2.0_f32(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_2.0_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store float 2.0, ptr addrspace(1) %out
ret void
}
@@ -296,6 +410,16 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f32(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_m_2.0_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, -2.0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store float -2.0, ptr addrspace(1) %out
ret void
}
@@ -320,6 +444,16 @@ define amdgpu_kernel void @store_inline_imm_4.0_f32(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_4.0_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 4.0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store float 4.0, ptr addrspace(1) %out
ret void
}
@@ -344,6 +478,16 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f32(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_m_4.0_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, -4.0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store float -4.0, ptr addrspace(1) %out
ret void
}
@@ -368,6 +512,16 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_inv_2pi_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0.15915494
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store float 0x3FC45F3060000000, ptr addrspace(1) %out
ret void
}
@@ -392,6 +546,16 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(ptr addrspace(1) %out)
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_m_inv_2pi_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0xbe22f983
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store float 0xBFC45F3060000000, ptr addrspace(1) %out
ret void
}
@@ -416,6 +580,16 @@ define amdgpu_kernel void @store_literal_imm_f32(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_literal_imm_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x45800000
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store float 4096.0, ptr addrspace(1) %out
ret void
}
@@ -442,6 +616,17 @@ define amdgpu_kernel void @add_inline_imm_0.0_f32(ptr addrspace(1) %out, float %
; VI-NEXT: v_add_f32_e64 v0, s6, 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_0.0_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f32_e64 v0, s6, 0
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd float %x, 0.0
store float %y, ptr addrspace(1) %out
ret void
@@ -469,6 +654,17 @@ define amdgpu_kernel void @add_inline_imm_0.5_f32(ptr addrspace(1) %out, float %
; VI-NEXT: v_add_f32_e64 v0, s6, 0.5
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_0.5_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f32_e64 v0, s6, 0.5
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd float %x, 0.5
store float %y, ptr addrspace(1) %out
ret void
@@ -496,6 +692,17 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(ptr addrspace(1) %out, flo
; VI-NEXT: v_add_f32_e64 v0, s6, -0.5
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_0.5_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f32_e64 v0, s6, -0.5
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd float %x, -0.5
store float %y, ptr addrspace(1) %out
ret void
@@ -523,6 +730,17 @@ define amdgpu_kernel void @add_inline_imm_1.0_f32(ptr addrspace(1) %out, float %
; VI-NEXT: v_add_f32_e64 v0, s6, 1.0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_1.0_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f32_e64 v0, s6, 1.0
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd float %x, 1.0
store float %y, ptr addrspace(1) %out
ret void
@@ -550,6 +768,17 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(ptr addrspace(1) %out, flo
; VI-NEXT: v_add_f32_e64 v0, s6, -1.0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_1.0_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f32_e64 v0, s6, -1.0
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd float %x, -1.0
store float %y, ptr addrspace(1) %out
ret void
@@ -577,6 +806,17 @@ define amdgpu_kernel void @add_inline_imm_2.0_f32(ptr addrspace(1) %out, float %
; VI-NEXT: v_add_f32_e64 v0, s6, 2.0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_2.0_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f32_e64 v0, s6, 2.0
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd float %x, 2.0
store float %y, ptr addrspace(1) %out
ret void
@@ -604,6 +844,17 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(ptr addrspace(1) %out, flo
; VI-NEXT: v_add_f32_e64 v0, s6, -2.0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_2.0_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f32_e64 v0, s6, -2.0
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd float %x, -2.0
store float %y, ptr addrspace(1) %out
ret void
@@ -631,6 +882,17 @@ define amdgpu_kernel void @add_inline_imm_4.0_f32(ptr addrspace(1) %out, float %
; VI-NEXT: v_add_f32_e64 v0, s6, 4.0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_4.0_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f32_e64 v0, s6, 4.0
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd float %x, 4.0
store float %y, ptr addrspace(1) %out
ret void
@@ -658,6 +920,17 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(ptr addrspace(1) %out, flo
; VI-NEXT: v_add_f32_e64 v0, s6, -4.0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_4.0_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f32_e64 v0, s6, -4.0
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd float %x, -4.0
store float %y, ptr addrspace(1) %out
ret void
@@ -699,6 +972,24 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(ptr addrspace(1) %out,
; VI-NEXT: v_add_f32_e32 v0, 0.5, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: commute_add_inline_imm_0.5_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s7, 0xf000
+; GFX942-NEXT: s_mov_b32 s6, -1
+; GFX942-NEXT: s_mov_b32 s10, s6
+; GFX942-NEXT: s_mov_b32 s11, s7
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_mov_b32 s8, s2
+; GFX942-NEXT: s_mov_b32 s9, s3
+; GFX942-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GFX942-NEXT: s_mov_b32 s4, s0
+; GFX942-NEXT: s_mov_b32 s5, s1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_add_f32_e32 v0, 0.5, v0
+; GFX942-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX942-NEXT: s_endpgm
%x = load float, ptr addrspace(1) %in
%y = fadd float %x, 0.5
store float %y, ptr addrspace(1) %out
@@ -741,6 +1032,24 @@ define amdgpu_kernel void @commute_add_literal_f32(ptr addrspace(1) %out, ptr ad
; VI-NEXT: v_add_f32_e32 v0, 0x44800000, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: commute_add_literal_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s7, 0xf000
+; GFX942-NEXT: s_mov_b32 s6, -1
+; GFX942-NEXT: s_mov_b32 s10, s6
+; GFX942-NEXT: s_mov_b32 s11, s7
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_mov_b32 s8, s2
+; GFX942-NEXT: s_mov_b32 s9, s3
+; GFX942-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GFX942-NEXT: s_mov_b32 s4, s0
+; GFX942-NEXT: s_mov_b32 s5, s1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_add_f32_e32 v0, 0x44800000, v0
+; GFX942-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX942-NEXT: s_endpgm
%x = load float, ptr addrspace(1) %in
%y = fadd float %x, 1024.0
store float %y, ptr addrspace(1) %out
@@ -769,6 +1078,17 @@ define amdgpu_kernel void @add_inline_imm_1_f32(ptr addrspace(1) %out, float %x)
; VI-NEXT: v_add_f32_e64 v0, s6, 1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_1_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f32_e64 v0, s6, 1
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd float %x, 0x36a0000000000000
store float %y, ptr addrspace(1) %out
ret void
@@ -796,6 +1116,17 @@ define amdgpu_kernel void @add_inline_imm_2_f32(ptr addrspace(1) %out, float %x)
; VI-NEXT: v_add_f32_e64 v0, s6, 2
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_2_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f32_e64 v0, s6, 2
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd float %x, 0x36b0000000000000
store float %y, ptr addrspace(1) %out
ret void
@@ -823,6 +1154,17 @@ define amdgpu_kernel void @add_inline_imm_16_f32(ptr addrspace(1) %out, float %x
; VI-NEXT: v_add_f32_e64 v0, s6, 16
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_16_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f32_e64 v0, s6, 16
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd float %x, 0x36e0000000000000
store float %y, ptr addrspace(1) %out
ret void
@@ -852,6 +1194,18 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f32(ptr addrspace(1) %out, float
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_1_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_add_i32 s4, s6, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%xbc = bitcast float %x to i32
%y = add i32 %xbc, -1
%ybc = bitcast i32 %y to float
@@ -883,6 +1237,18 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f32(ptr addrspace(1) %out, float
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_2_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_add_i32 s4, s6, -2
+; GFX942-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%xbc = bitcast float %x to i32
%y = add i32 %xbc, -2
%ybc = bitcast i32 %y to float
@@ -914,6 +1280,18 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f32(ptr addrspace(1) %out, floa
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_16_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_add_i32 s4, s6, -16
+; GFX942-NEXT: v_mov_b32_e32 v0, s4
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%xbc = bitcast float %x to i32
%y = add i32 %xbc, -16
%ybc = bitcast i32 %y to float
@@ -943,6 +1321,17 @@ define amdgpu_kernel void @add_inline_imm_63_f32(ptr addrspace(1) %out, float %x
; VI-NEXT: v_add_f32_e64 v0, s6, 63
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_63_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f32_e64 v0, s6, 63
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd float %x, 0x36ff800000000000
store float %y, ptr addrspace(1) %out
ret void
@@ -970,6 +1359,17 @@ define amdgpu_kernel void @add_inline_imm_64_f32(ptr addrspace(1) %out, float %x
; VI-NEXT: v_add_f32_e64 v0, s6, 64
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_64_f32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f32_e64 v0, s6, 64
+; GFX942-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd float %x, 0x3700000000000000
store float %y, ptr addrspace(1) %out
ret void
@@ -999,6 +1399,17 @@ define amdgpu_kernel void @add_inline_imm_0.0_f64(ptr addrspace(1) %out, [8 x i3
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_0.0_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 0
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd double %x, 0.0
store double %y, ptr addrspace(1) %out
ret void
@@ -1028,6 +1439,17 @@ define amdgpu_kernel void @add_inline_imm_0.5_f64(ptr addrspace(1) %out, [8 x i3
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_0.5_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 0.5
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd double %x, 0.5
store double %y, ptr addrspace(1) %out
ret void
@@ -1057,6 +1479,17 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(ptr addrspace(1) %out, [8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_0.5_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], -0.5
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd double %x, -0.5
store double %y, ptr addrspace(1) %out
ret void
@@ -1086,6 +1519,17 @@ define amdgpu_kernel void @add_inline_imm_1.0_f64(ptr addrspace(1) %out, [8 x i3
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_1.0_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 1.0
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd double %x, 1.0
store double %y, ptr addrspace(1) %out
ret void
@@ -1115,6 +1559,17 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(ptr addrspace(1) %out, [8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_1.0_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], -1.0
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd double %x, -1.0
store double %y, ptr addrspace(1) %out
ret void
@@ -1144,6 +1599,17 @@ define amdgpu_kernel void @add_inline_imm_2.0_f64(ptr addrspace(1) %out, [8 x i3
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_2.0_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 2.0
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd double %x, 2.0
store double %y, ptr addrspace(1) %out
ret void
@@ -1173,6 +1639,17 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(ptr addrspace(1) %out, [8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_2.0_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], -2.0
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd double %x, -2.0
store double %y, ptr addrspace(1) %out
ret void
@@ -1202,6 +1679,17 @@ define amdgpu_kernel void @add_inline_imm_4.0_f64(ptr addrspace(1) %out, [8 x i3
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_4.0_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 4.0
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd double %x, 4.0
store double %y, ptr addrspace(1) %out
ret void
@@ -1231,6 +1719,17 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(ptr addrspace(1) %out, [8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_4.0_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], -4.0
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd double %x, -4.0
store double %y, ptr addrspace(1) %out
ret void
@@ -1262,6 +1761,17 @@ define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(ptr addrspace(1) %out, [8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_inv_2pi_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 0.15915494309189532
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd double %x, 0x3fc45f306dc9c882
store double %y, ptr addrspace(1) %out
ret void
@@ -1295,6 +1805,19 @@ define amdgpu_kernel void @add_m_inv_2pi_f64(ptr addrspace(1) %out, [8 x i32], d
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_m_inv_2pi_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x6dc9c882
+; GFX942-NEXT: v_mov_b32_e32 v1, 0xbfc45f30
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd double %x, 0xbfc45f306dc9c882
store double %y, ptr addrspace(1) %out
ret void
@@ -1324,6 +1847,17 @@ define amdgpu_kernel void @add_inline_imm_1_f64(ptr addrspace(1) %out, [8 x i32]
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_1_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 1
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd double %x, 0x0000000000000001
store double %y, ptr addrspace(1) %out
ret void
@@ -1353,6 +1887,17 @@ define amdgpu_kernel void @add_inline_imm_2_f64(ptr addrspace(1) %out, [8 x i32]
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_2_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 2
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd double %x, 0x0000000000000002
store double %y, ptr addrspace(1) %out
ret void
@@ -1382,6 +1927,17 @@ define amdgpu_kernel void @add_inline_imm_16_f64(ptr addrspace(1) %out, [8 x i32
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_16_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 16
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd double %x, 0x0000000000000010
store double %y, ptr addrspace(1) %out
ret void
@@ -1409,6 +1965,17 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_1_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, -1
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd double %x, 0xffffffffffffffff
store double %y, ptr addrspace(1) %out
ret void
@@ -1436,6 +2003,17 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_2_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, -2
+; GFX942-NEXT: v_mov_b32_e32 v1, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd double %x, 0xfffffffffffffffe
store double %y, ptr addrspace(1) %out
ret void
@@ -1463,6 +2041,17 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_16_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, -16
+; GFX942-NEXT: v_mov_b32_e32 v1, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd double %x, 0xfffffffffffffff0
store double %y, ptr addrspace(1) %out
ret void
@@ -1492,6 +2081,17 @@ define amdgpu_kernel void @add_inline_imm_63_f64(ptr addrspace(1) %out, [8 x i32
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_63_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 63
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd double %x, 0x000000000000003F
store double %y, ptr addrspace(1) %out
ret void
@@ -1521,6 +2121,17 @@ define amdgpu_kernel void @add_inline_imm_64_f64(ptr addrspace(1) %out, [8 x i32
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_64_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_add_f64 v[0:1], s[6:7], 64
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
%y = fadd double %x, 0x0000000000000040
store double %y, ptr addrspace(1) %out
ret void
@@ -1548,6 +2159,17 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_0.0_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store double 0.0, ptr addrspace(1) %out
ret void
}
@@ -1574,6 +2196,17 @@ define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(ptr addrspace(1) %out)
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_literal_imm_neg_0.0_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store double -0.0, ptr addrspace(1) %out
ret void
}
@@ -1600,6 +2233,17 @@ define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_0.5_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0x3fe00000
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store double 0.5, ptr addrspace(1) %out
ret void
}
@@ -1626,6 +2270,17 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_m_0.5_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0xbfe00000
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store double -0.5, ptr addrspace(1) %out
ret void
}
@@ -1652,6 +2307,17 @@ define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_1.0_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0x3ff00000
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store double 1.0, ptr addrspace(1) %out
ret void
}
@@ -1678,6 +2344,17 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_m_1.0_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0xbff00000
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store double -1.0, ptr addrspace(1) %out
ret void
}
@@ -1704,6 +2381,17 @@ define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_2.0_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store double 2.0, ptr addrspace(1) %out
ret void
}
@@ -1730,6 +2418,17 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_m_2.0_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, -2.0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store double -2.0, ptr addrspace(1) %out
ret void
}
@@ -1756,6 +2455,17 @@ define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_4.0_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store double 4.0, ptr addrspace(1) %out
ret void
}
@@ -1782,6 +2492,17 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_m_4.0_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0xc0100000
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store double -4.0, ptr addrspace(1) %out
ret void
}
@@ -1808,6 +2529,17 @@ define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inv_2pi_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x6dc9c882
+; GFX942-NEXT: v_mov_b32_e32 v1, 0x3fc45f30
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store double 0x3fc45f306dc9c882, ptr addrspace(1) %out
ret void
}
@@ -1834,6 +2566,17 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(ptr addrspace(1) %out)
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_m_inv_2pi_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x6dc9c882
+; GFX942-NEXT: v_mov_b32_e32 v1, 0xbfc45f30
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store double 0xbfc45f306dc9c882, ptr addrspace(1) %out
ret void
}
@@ -1860,6 +2603,17 @@ define amdgpu_kernel void @store_literal_imm_f64(ptr addrspace(1) %out) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: store_literal_imm_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_mov_b32 s3, 0xf000
+; GFX942-NEXT: s_mov_b32 s2, -1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0x40b00000
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT: s_endpgm
store double 4096.0, ptr addrspace(1) %out
ret void
}
@@ -1871,6 +2625,13 @@ define amdgpu_vs void @literal_folding(float %arg) {
; GCN-NEXT: v_mul_f32_e32 v0, 0xbf4353f8, v0
; GCN-NEXT: exp pos0 v1, v1, v0, v0 done
; GCN-NEXT: s_endpgm
+;
+; GFX942-LABEL: literal_folding:
+; GFX942: ; %bb.0: ; %main_body
+; GFX942-NEXT: v_mul_f32_e32 v1, 0x3f4353f8, v0
+; GFX942-NEXT: v_mul_f32_e32 v0, 0xbf4353f8, v0
+; GFX942-NEXT: exp pos0 v1, v1, v0, v0 done
+; GFX942-NEXT: s_endpgm
main_body:
%tmp = fmul float %arg, 0x3FE86A7F00000000
%tmp1 = fmul float %arg, 0xBFE86A7F00000000
diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll
index 676773a..91b9773 100644
--- a/llvm/test/CodeGen/AMDGPU/imm16.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm16.ll
@@ -19,16 +19,27 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: store_inline_imm_neg_0.0_i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc ; encoding: [0x00,0x20,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_neg_0.0_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x8000 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x80,0xff,0xff]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc ; encoding: [0x00,0x20,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_neg_0.0_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc ; encoding: [0x00,0x20,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: store_inline_imm_neg_0.0_i16:
; VI: ; %bb.0:
@@ -66,15 +77,25 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) {
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: store_inline_imm_0.0_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_0.0_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 ; encoding: [0x80,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_0.0_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: store_inline_imm_0.0_f16:
; VI: ; %bb.0:
@@ -110,15 +131,25 @@ define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) {
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: store_imm_neg_0.0_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_imm_neg_0.0_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x8000 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x80,0xff,0xff]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_imm_neg_0.0_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: store_imm_neg_0.0_f16:
; VI: ; %bb.0:
@@ -154,15 +185,25 @@ define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) {
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: store_inline_imm_0.5_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: v_mov_b32_e32 v0, 0x3800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x00]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_0.5_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3800 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x38,0x00,0x00]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_0.5_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x3800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x00]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: store_inline_imm_0.5_f16:
; VI: ; %bb.0:
@@ -198,15 +239,25 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) {
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: store_inline_imm_m_0.5_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffb800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0xff,0xff]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_m_0.5_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0xb800 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0xb8,0xff,0xff]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_m_0.5_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0xffffb800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0xff,0xff]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: store_inline_imm_m_0.5_f16:
; VI: ; %bb.0:
@@ -242,15 +293,25 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) {
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: store_inline_imm_1.0_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: v_mov_b32_e32 v0, 0x3c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x00]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_1.0_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3c00 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x3c,0x00,0x00]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_1.0_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x3c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x00]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: store_inline_imm_1.0_f16:
; VI: ; %bb.0:
@@ -286,15 +347,25 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) {
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: store_inline_imm_m_1.0_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffbc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0xff,0xff]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_m_1.0_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0xbc00 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0xbc,0xff,0xff]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_m_1.0_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0xffffbc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0xff,0xff]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: store_inline_imm_m_1.0_f16:
; VI: ; %bb.0:
@@ -330,15 +401,25 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) {
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: store_inline_imm_2.0_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: v_mov_b32_e32 v0, 0x4000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x00]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_2.0_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x4000 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x40,0x00,0x00]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_2.0_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x4000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x00]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: store_inline_imm_2.0_f16:
; VI: ; %bb.0:
@@ -374,15 +455,25 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) {
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: store_inline_imm_m_2.0_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffc000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0xff,0xff]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_m_2.0_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0xc000 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0xc0,0xff,0xff]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_m_2.0_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0xffffc000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0xff,0xff]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: store_inline_imm_m_2.0_f16:
; VI: ; %bb.0:
@@ -418,15 +509,25 @@ define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) {
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: store_inline_imm_4.0_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x00]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_4.0_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x4400 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x44,0x00,0x00]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_4.0_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x4400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x00]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: store_inline_imm_4.0_f16:
; VI: ; %bb.0:
@@ -462,15 +563,25 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) {
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: store_inline_imm_m_4.0_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffc400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0xff,0xff]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_m_4.0_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0xc400 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0xc4,0xff,0xff]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_m_4.0_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0xffffc400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0xff,0xff]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: store_inline_imm_m_4.0_f16:
; VI: ; %bb.0:
@@ -506,15 +617,25 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) {
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: store_inline_imm_inv_2pi_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: v_mov_b32_e32 v0, 0x3118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x00,0x00]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_inv_2pi_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3118 ; encoding: [0xff,0x38,0x00,0x7e,0x18,0x31,0x00,0x00]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_inv_2pi_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x3118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x00,0x00]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: store_inline_imm_inv_2pi_f16:
; VI: ; %bb.0:
@@ -550,15 +671,25 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out)
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: store_inline_imm_m_inv_2pi_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffb118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0xff,0xff]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_m_inv_2pi_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0xb118 ; encoding: [0xff,0x38,0x00,0x7e,0x18,0xb1,0xff,0xff]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_m_inv_2pi_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0xffffb118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0xff,0xff]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: store_inline_imm_m_inv_2pi_f16:
; VI: ; %bb.0:
@@ -594,15 +725,25 @@ define amdgpu_kernel void @store_literal_imm_f16(ptr addrspace(1) %out) {
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: store_literal_imm_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: v_mov_b32_e32 v0, 0x6c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x00]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_literal_imm_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x6c00 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x6c,0x00,0x00]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_literal_imm_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x6c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x00]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: store_literal_imm_f16:
; VI: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
index c48613b..2a693e1 100644
--- a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
@@ -276,23 +276,23 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo
;.
; V4: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; V4: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
;.
; V5: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; V5: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
;.
; V6: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; V6: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
;.
; V4: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 400}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 17a5f52..f5d7bb3 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -4244,38 +4244,22 @@ define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addr
; NOOPT-NEXT: v_mov_b32_e32 v10, v29
; NOOPT-NEXT: v_mov_b32_e32 v11, v28
; NOOPT-NEXT: v_mov_b32_e32 v15, v27
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
; NOOPT-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18 killed $exec
; NOOPT-NEXT: v_mov_b32_e32 v16, v11
; NOOPT-NEXT: v_mov_b32_e32 v17, v10
; NOOPT-NEXT: v_mov_b32_e32 v18, v9
; NOOPT-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
; NOOPT-NEXT: v_mov_b32_e32 v9, v14
; NOOPT-NEXT: v_mov_b32_e32 v10, v13
; NOOPT-NEXT: v_mov_b32_e32 v11, v12
; NOOPT-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10 killed $exec
; NOOPT-NEXT: s_waitcnt expcnt(0)
; NOOPT-NEXT: v_mov_b32_e32 v8, v3
; NOOPT-NEXT: v_mov_b32_e32 v9, v2
; NOOPT-NEXT: v_mov_b32_e32 v10, v1
; NOOPT-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT: v_mov_b32_e32 v1, v6
; NOOPT-NEXT: v_mov_b32_e32 v2, v5
@@ -4733,38 +4717,22 @@ define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, p
; NOOPT-NEXT: v_mov_b32_e32 v10, v29
; NOOPT-NEXT: v_mov_b32_e32 v11, v28
; NOOPT-NEXT: v_mov_b32_e32 v15, v27
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
; NOOPT-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18 killed $exec
; NOOPT-NEXT: v_mov_b32_e32 v16, v11
; NOOPT-NEXT: v_mov_b32_e32 v17, v10
; NOOPT-NEXT: v_mov_b32_e32 v18, v9
; NOOPT-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
; NOOPT-NEXT: v_mov_b32_e32 v9, v14
; NOOPT-NEXT: v_mov_b32_e32 v10, v13
; NOOPT-NEXT: v_mov_b32_e32 v11, v12
; NOOPT-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10 killed $exec
; NOOPT-NEXT: s_waitcnt expcnt(0)
; NOOPT-NEXT: v_mov_b32_e32 v8, v3
; NOOPT-NEXT: v_mov_b32_e32 v9, v2
; NOOPT-NEXT: v_mov_b32_e32 v10, v1
; NOOPT-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT: v_mov_b32_e32 v1, v6
; NOOPT-NEXT: v_mov_b32_e32 v2, v5
@@ -5061,7 +5029,6 @@ define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1
; NOOPT-NEXT: s_waitcnt expcnt(0)
; NOOPT-NEXT: v_lshlrev_b32_e64 v0, s4, v0
; NOOPT-NEXT: s_mov_b32 s4, 0
-; NOOPT-NEXT: ; implicit-def: $sgpr4
; NOOPT-NEXT: v_mov_b32_e32 v2, 0
; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; NOOPT-NEXT: v_mov_b32_e32 v1, v2
@@ -5829,7 +5796,6 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
; NOOPT-NEXT: s_waitcnt expcnt(0)
; NOOPT-NEXT: v_lshlrev_b32_e64 v0, s20, v0
; NOOPT-NEXT: s_mov_b32 s20, 0
-; NOOPT-NEXT: ; implicit-def: $sgpr20
; NOOPT-NEXT: v_mov_b32_e32 v2, 0
; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; NOOPT-NEXT: v_mov_b32_e32 v1, v2
@@ -6159,40 +6125,24 @@ define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1)
; NOOPT-NEXT: v_mov_b32_e32 v11, v30
; NOOPT-NEXT: v_mov_b32_e32 v12, v29
; NOOPT-NEXT: v_mov_b32_e32 v16, v28
-; NOOPT-NEXT: ; implicit-def: $sgpr1
-; NOOPT-NEXT: ; implicit-def: $sgpr1
-; NOOPT-NEXT: ; implicit-def: $sgpr1
-; NOOPT-NEXT: ; implicit-def: $sgpr1
; NOOPT-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17_vgpr18_vgpr19 killed $exec
; NOOPT-NEXT: v_mov_b32_e32 v17, v12
; NOOPT-NEXT: v_mov_b32_e32 v18, v11
; NOOPT-NEXT: v_mov_b32_e32 v19, v10
; NOOPT-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:48
; NOOPT-NEXT: s_waitcnt vmcnt(0)
-; NOOPT-NEXT: ; implicit-def: $sgpr1
-; NOOPT-NEXT: ; implicit-def: $sgpr1
-; NOOPT-NEXT: ; implicit-def: $sgpr1
-; NOOPT-NEXT: ; implicit-def: $sgpr1
; NOOPT-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10_vgpr11_vgpr12 killed $exec
; NOOPT-NEXT: v_mov_b32_e32 v10, v15
; NOOPT-NEXT: v_mov_b32_e32 v11, v14
; NOOPT-NEXT: v_mov_b32_e32 v12, v13
; NOOPT-NEXT: buffer_store_dwordx4 v[9:12], off, s[4:7], 0 offset:32
; NOOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; NOOPT-NEXT: ; implicit-def: $sgpr1
-; NOOPT-NEXT: ; implicit-def: $sgpr1
-; NOOPT-NEXT: ; implicit-def: $sgpr1
-; NOOPT-NEXT: ; implicit-def: $sgpr1
; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
; NOOPT-NEXT: v_mov_b32_e32 v9, v4
; NOOPT-NEXT: v_mov_b32_e32 v10, v3
; NOOPT-NEXT: v_mov_b32_e32 v11, v2
; NOOPT-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
; NOOPT-NEXT: s_waitcnt vmcnt(0)
-; NOOPT-NEXT: ; implicit-def: $sgpr1
-; NOOPT-NEXT: ; implicit-def: $sgpr1
-; NOOPT-NEXT: ; implicit-def: $sgpr1
-; NOOPT-NEXT: ; implicit-def: $sgpr1
; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
; NOOPT-NEXT: v_mov_b32_e32 v2, v7
; NOOPT-NEXT: v_mov_b32_e32 v3, v6
@@ -7278,7 +7228,6 @@ define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
; NOOPT-NEXT: ; implicit-def: $sgpr2
; NOOPT-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; NOOPT-NEXT: s_mov_b32 s0, 1
-; NOOPT-NEXT: ; implicit-def: $sgpr1
; NOOPT-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0
; NOOPT-NEXT: s_and_b64 vcc, exec, s[0:1]
; NOOPT-NEXT: s_cbranch_vccnz .LBB19_4
@@ -7519,7 +7468,6 @@ define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
; NOOPT-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; NOOPT-NEXT: s_mov_b32 s0, 1
-; NOOPT-NEXT: ; implicit-def: $sgpr1
; NOOPT-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0
; NOOPT-NEXT: s_and_b64 vcc, exec, s[0:1]
; NOOPT-NEXT: s_cbranch_vccnz .LBB20_4
@@ -9266,7 +9214,6 @@ define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Reload
; NOOPT-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; NOOPT-NEXT: s_mov_b32 s0, 1
-; NOOPT-NEXT: ; implicit-def: $sgpr1
; NOOPT-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, s0
; NOOPT-NEXT: s_and_b64 vcc, exec, s[0:1]
; NOOPT-NEXT: s_waitcnt vmcnt(0)
@@ -9530,8 +9477,6 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
; NOOPT-NEXT: s_mov_b32 s5, s0
; NOOPT-NEXT: s_mov_b32 s6, s2
; NOOPT-NEXT: s_mov_b32 s7, s3
-; NOOPT-NEXT: ; implicit-def: $sgpr0
-; NOOPT-NEXT: ; implicit-def: $sgpr0
; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; NOOPT-NEXT: v_mov_b32_e32 v1, v2
; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:136 ; 4-byte Folded Spill
@@ -9550,7 +9495,6 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
; NOOPT-NEXT: v_writelane_b32 v33, s10, 5
; NOOPT-NEXT: v_writelane_b32 v33, s11, 6
; NOOPT-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10_sgpr11 killed $sgpr4_sgpr5_sgpr6_sgpr7
-; NOOPT-NEXT: ; implicit-def: $sgpr2_sgpr3
; NOOPT-NEXT: s_waitcnt expcnt(1)
; NOOPT-NEXT: v_mov_b32_e32 v0, s1
; NOOPT-NEXT: buffer_load_dword v0, v0, s[4:7], s0 offen
@@ -9763,10 +9707,6 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
; NOOPT-NEXT: v_mov_b32_e32 v12, v31
; NOOPT-NEXT: v_mov_b32_e32 v13, v30
; NOOPT-NEXT: v_mov_b32_e32 v17, v29
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
; NOOPT-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18_vgpr19_vgpr20 killed $exec
; NOOPT-NEXT: v_mov_b32_e32 v18, v13
; NOOPT-NEXT: v_mov_b32_e32 v19, v12
@@ -9774,10 +9714,6 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
; NOOPT-NEXT: v_mov_b32_e32 v12, v5
; NOOPT-NEXT: v_mov_b32_e32 v11, v4
; NOOPT-NEXT: buffer_store_dwordx4 v[17:20], v[11:12], s[0:3], 0 addr64 offset:48
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
; NOOPT-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11_vgpr12_vgpr13 killed $exec
; NOOPT-NEXT: v_mov_b32_e32 v11, v16
; NOOPT-NEXT: v_mov_b32_e32 v12, v15
@@ -9785,10 +9721,6 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
; NOOPT-NEXT: v_mov_b32_e32 v15, v5
; NOOPT-NEXT: v_mov_b32_e32 v14, v4
; NOOPT-NEXT: buffer_store_dwordx4 v[10:13], v[14:15], s[0:3], 0 addr64 offset:32
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
; NOOPT-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10_vgpr11_vgpr12 killed $exec
; NOOPT-NEXT: s_waitcnt expcnt(0)
; NOOPT-NEXT: v_mov_b32_e32 v10, v3
@@ -9797,10 +9729,6 @@ define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace
; NOOPT-NEXT: v_mov_b32_e32 v1, v4
; NOOPT-NEXT: v_mov_b32_e32 v2, v5
; NOOPT-NEXT: buffer_store_dwordx4 v[9:12], v[1:2], s[0:3], 0 addr64 offset:16
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
-; NOOPT-NEXT: ; implicit-def: $sgpr4
; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; NOOPT-NEXT: v_mov_b32_e32 v1, v8
; NOOPT-NEXT: v_mov_b32_e32 v2, v7
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll
index 9d98c701..d09b4fd 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll
@@ -68,6 +68,6 @@ if.end:
ret void
}
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
index 0a493e51..59dfd71 100644
--- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
@@ -141,9 +141,9 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl
; CHECK-NEXT: s_mov_b64 s[0:1], -1
; CHECK-NEXT: s_cbranch_vccz .LBB3_7
; CHECK-NEXT: ; %bb.6: ; %atomicrmw.global
-; CHECK-NEXT: v_mov_b32_e32 v2, 0
-; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; CHECK-NEXT: global_atomic_add_f64 v0, v[2:3], s[4:5]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: s_mov_b64 s[0:1], 0
@@ -165,9 +165,9 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl
; CHECK-NEXT: .LBB3_10: ; %atomicrmw.shared
; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0
; CHECK-NEXT: s_cselect_b32 s0, s4, -1
-; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; CHECK-NEXT: ds_add_f64 v2, v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; CHECK-NEXT: ds_add_f64 v0, v[2:3]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir b/llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir
index c34c974..4d8fb8d 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir
@@ -105,3 +105,60 @@ body: |
...
+---
+name: av_mov_b64_split
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr72_sgpr73_sgpr74_sgpr75'
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 7
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr4_sgpr5
+
+ ; CHECK-LABEL: name: av_mov_b64_split
+ ; CHECK: liveins: $agpr6, $agpr7, $agpr8, $agpr9, $vgpr0, $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr0_agpr1
+ ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr0_agpr1
+ ; CHECK-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec, implicit-def $agpr2_agpr3
+ ; CHECK-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr2_agpr3
+ ; CHECK-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec, implicit-def $agpr4_agpr5
+ ; CHECK-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr4_agpr5
+ ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 3, implicit $exec, implicit-def $vgpr0_vgpr1
+ ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
+ ; CHECK-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1
+ ; CHECK-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
+ ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 4, implicit $exec, implicit-def $vgpr0_vgpr1
+ ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
+ ; CHECK-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1
+ ; CHECK-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
+ ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1
+ ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr2_agpr3
+ ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr4_agpr5
+ ; CHECK-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit-def $vgpr0_vgpr1
+ ; CHECK-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $vgpr0_vgpr1
+ ; CHECK-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1
+ ; CHECK-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit-def $vgpr0_vgpr1
+ ; CHECK-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $vgpr0_vgpr1
+ ; CHECK-NEXT: S_NOP 0, implicit killed renamable $vgpr0_vgpr1
+ %0:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+ %1:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 1, implicit $exec
+ %2:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 2, implicit $exec
+ %3:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 3, implicit $exec
+ %4:vreg_64_align2 = AV_MOV_B64_IMM_PSEUDO 4, implicit $exec
+
+ %5:areg_64_align2 = COPY %0
+ %6:areg_64_align2 = COPY %1
+ %7:areg_64_align2 = COPY %2
+ %8:areg_64_align2 = COPY %3
+ %9:areg_64_align2 = COPY %4
+
+ S_NOP 0, implicit %5
+ S_NOP 0, implicit %6
+ S_NOP 0, implicit %7
+ S_NOP 0, implicit %8
+ S_NOP 0, implicit %9
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
index c7767cb8..45c185b 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
@@ -16,15 +16,28 @@
ret void
}
- define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2() #0 {
+ define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_physreg_src2() #0 {
ret void
}
- define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg() #0 {
+ define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_first() #1 {
+ ret void
+ }
+
+ define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_second() #1 {
+ ret void
+ }
+
+ define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_first_physreg() #1 {
+ ret void
+ }
+
+ define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_second_physreg() #1 {
ret void
}
attributes #0 = { "amdgpu-wave-limiter"="true" "amdgpu-waves-per-eu"="8,8" }
+ attributes #1 = { "amdgpu-wave-limiter"="true" "amdgpu-waves-per-eu"="10,10" }
...
# Inflate pattern, except the defining instruction isn't an MFMA.
@@ -320,9 +333,9 @@ body: |
...
-# Non-mac variant, src2 is an immediate.
+# Non-mac variant, src2 is a physical register.
---
-name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_physreg_src2
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
@@ -330,7 +343,7 @@ machineFunctionInfo:
occupancy: 10
sgprForEXECCopy: '$sgpr100_sgpr101'
body: |
- ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_physreg_src2
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
@@ -346,7 +359,7 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: early-clobber renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: early-clobber renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
@@ -380,7 +393,7 @@ body: |
bb.1:
liveins: $vcc
- %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
S_CBRANCH_VCCNZ %bb.1, implicit $vcc
S_BRANCH %bb.2
@@ -403,9 +416,9 @@ body: |
...
-# Non-mac variant, src2 is the same VGPR, but a different subregister.
+# There isn't an assignable AGPR around the first MFMA.
---
-name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_first
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
@@ -413,7 +426,7 @@ machineFunctionInfo:
occupancy: 10
sgprForEXECCopy: '$sgpr100_sgpr101'
body: |
- ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_first
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
@@ -429,18 +442,20 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit-def renamable $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit-def renamable $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47, implicit-def renamable $agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63
; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit killed renamable $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit killed renamable $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47, implicit killed renamable $agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63
+ ; CHECK-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31:0x00000000FFFFFFFF
+ ; CHECK-NEXT: liveins: $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35:0x00000000FFFFFFFF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
@@ -456,23 +471,25 @@ body: |
bb.0:
S_NOP 0, implicit-def $agpr0
renamable $sgpr0 = S_MOV_B32 0
- undef %0.sub8:vreg_1024_align2 = V_MOV_B32_e32 0, implicit $exec
+ undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
renamable $sgpr1 = COPY renamable $sgpr0
%1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
- %0.sub9:vreg_1024_align2 = COPY %0.sub8
+ %0.sub9:vreg_512_align2 = COPY %0.sub8
bb.1:
liveins: $vcc
- undef %0.sub0_sub1:vreg_1024_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- %0.sub16_sub17:vreg_1024_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- %0.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:vreg_1024_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31, 0, 0, 0, implicit $mode, implicit $exec
+ undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %2:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ S_NOP 0, implicit-def %6:areg_512_align2, implicit-def %7:areg_512_align2, implicit-def %8:areg_512_align2, implicit-def %9:areg_512_align2
+ %3:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
+ S_NOP 0, implicit %6, implicit %7, implicit %8, implicit %9
+ %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %3, 0, 0, 0, implicit $mode, implicit $exec
S_CBRANCH_VCCNZ %bb.1, implicit $vcc
S_BRANCH %bb.2
bb.2:
- ; No VGPRs available for %0
+ ; No VGPRs available for %0 or %4
S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -481,11 +498,340 @@ body: |
S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
- %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
- GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
- GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
- GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ S_ENDPGM 0
+
+...
+
+# There isn't an assignable AGPR around the second MFMA.
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_second
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_second
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr18_vgpr19 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit-def renamable $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit-def renamable $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47, implicit-def renamable $agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63
+ ; CHECK-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit killed renamable $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit killed renamable $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47, implicit killed renamable $agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35:0x00000000FFFFFFFF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_512_align2 = COPY %0.sub8
+
+ bb.1:
+ liveins: $vcc
+
+ undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %2:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %3:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
+ S_NOP 0, implicit-def %6:areg_512_align2, implicit-def %7:areg_512_align2, implicit-def %8:areg_512_align2, implicit-def %9:areg_512_align2
+ %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %3, 0, 0, 0, implicit $mode, implicit $exec
+ S_NOP 0, implicit %6, implicit %7, implicit %8, implicit %9
+ S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ ; No VGPRs available for %0 or %4
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ S_ENDPGM 0
+
+...
+
+# There isn't an assignable AGPR around the first MFMA, with physreg interference.
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_first_physreg
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_first_physreg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr18_vgpr19 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63
+ ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ ; CHECK-NEXT: S_NOP 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; CHECK-NEXT: S_NOP 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23
+ ; CHECK-NEXT: S_NOP 0, implicit $agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+ ; CHECK-NEXT: S_NOP 0, implicit $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39
+ ; CHECK-NEXT: S_NOP 0, implicit $agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47
+ ; CHECK-NEXT: S_NOP 0, implicit $agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55
+ ; CHECK-NEXT: S_NOP 0, implicit $agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63
+ ; CHECK-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35:0x00000000FFFFFFFF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_512_align2 = COPY %0.sub8
+
+ bb.1:
+ liveins: $vcc
+
+ undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %2:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ S_NOP 0, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ S_NOP 0, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ S_NOP 0, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23
+ S_NOP 0, implicit-def $agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+ S_NOP 0, implicit-def $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39
+ S_NOP 0, implicit-def $agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47
+ S_NOP 0, implicit-def $agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55
+ S_NOP 0, implicit-def $agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63
+ %3:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
+ S_NOP 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ S_NOP 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ S_NOP 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23
+ S_NOP 0, implicit $agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+ S_NOP 0, implicit $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39
+ S_NOP 0, implicit $agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47
+ S_NOP 0, implicit $agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55
+ S_NOP 0, implicit $agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63
+ %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %3, 0, 0, 0, implicit $mode, implicit $exec
+ S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ ; No VGPRs available for %0 or %4
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ S_ENDPGM 0
+
+...
+
+# There isn't an assignable AGPR around the second MFMA due to physreg interference.
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_second_physreg
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_second_physreg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr18_vgpr19 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63
+ ; CHECK-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ ; CHECK-NEXT: S_NOP 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; CHECK-NEXT: S_NOP 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23
+ ; CHECK-NEXT: S_NOP 0, implicit $agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+ ; CHECK-NEXT: S_NOP 0, implicit $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39
+ ; CHECK-NEXT: S_NOP 0, implicit $agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47
+ ; CHECK-NEXT: S_NOP 0, implicit $agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55
+ ; CHECK-NEXT: S_NOP 0, implicit $agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35:0x00000000FFFFFFFF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_512_align2 = COPY %0.sub8
+
+ bb.1:
+ liveins: $vcc
+
+ undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %2:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %3:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
+ S_NOP 0, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ S_NOP 0, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ S_NOP 0, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23
+ S_NOP 0, implicit-def $agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+ S_NOP 0, implicit-def $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39
+ S_NOP 0, implicit-def $agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47
+ S_NOP 0, implicit-def $agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55
+ S_NOP 0, implicit-def $agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63
+ %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %3, 0, 0, 0, implicit $mode, implicit $exec
+ S_NOP 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7
+ S_NOP 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ S_NOP 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23
+ S_NOP 0, implicit $agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+ S_NOP 0, implicit $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39
+ S_NOP 0, implicit $agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47
+ S_NOP 0, implicit $agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55
+ S_NOP 0, implicit $agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63
+ S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ ; No VGPRs available for %0 or %4
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
S_ENDPGM 0
...
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
index b907c13..19d9470d 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
@@ -209,15 +209,14 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: liveins: $vcc, $vgpr2_vgpr3
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- ; CHECK-NEXT: early-clobber renamable $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr2_vgpr3, $vgpr2_vgpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $agpr16_agpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr2_vgpr3, $vgpr2_vgpr3, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: liveins: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19:0x00000000FFFFFFFF
+ ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -296,16 +295,15 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $agpr0_agpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_mac_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_mac_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: liveins: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17:0x00000000FFFFFFFF
+ ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -384,16 +382,15 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 killed $vgpr4_vgpr5, $vgpr2_vgpr3, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $agpr0_agpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_mac_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_mac_e64 killed $agpr2_agpr3, $agpr0_agpr1, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: liveins: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17:0x00000000FFFFFFFF
+ ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -445,6 +442,85 @@ body: |
...
+
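+# Chain of 2 tied MFMAs; the final use is a VReg_512_Align2 inline asm, so the
+# result cannot be rewritten to AGPR at that use and is copied back to VGPRs.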
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_two_chained_uses_cannot_rewrite_final_use
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_two_chained_uses_cannot_rewrite_final_use
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $agpr0_agpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_mac_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_mac_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 28114953 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_512_align2 = COPY %0.sub8
+
+ bb.1:
+ liveins: $vcc
+
+ undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
+ %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
+ S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ ; No VGPRs available for %0
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 28114953 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2
+ S_ENDPGM 0
+
+...
+
# There is a rewrite candidate, but it is used by another MFMA which
# does not have a tied result.
---
@@ -472,18 +548,17 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: liveins: $vcc, $vgpr16_vgpr17
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $agpr16_agpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: renamable $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X8F16_mac_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: liveins: $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33:0x00000000FFFFFFFF
+ ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
@@ -559,18 +634,17 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $agpr0_agpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: early-clobber renamable $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X8F16_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: liveins: $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35:0x00000000FFFFFFFF
+ ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
@@ -619,10 +693,9 @@ body: |
S_ENDPGM 0
...
-
-# There isn't an assignable AGPR around the first MFMA.
+# Chain of 2 untied MFMAs, but the use isn't in src2.
---
-name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_first
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_non_src2
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
@@ -630,7 +703,7 @@ machineFunctionInfo:
occupancy: 10
sgprForEXECCopy: '$sgpr100_sgpr101'
body: |
- ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_first
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_non_src2
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
@@ -647,19 +720,16 @@ body: |
; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- ; CHECK-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit-def renamable $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit-def renamable $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47, implicit-def renamable $agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63
; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit killed renamable $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit killed renamable $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47, implicit killed renamable $agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63
- ; CHECK-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 killed $vgpr4_vgpr5, $vgpr8_vgpr9, undef $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: liveins: $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35:0x00000000FFFFFFFF
+ ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
@@ -685,10 +755,8 @@ body: |
liveins: $vcc
undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %2:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- S_NOP 0, implicit-def %6:areg_512_align2, implicit-def %7:areg_512_align2, implicit-def %8:areg_512_align2, implicit-def %9:areg_512_align2
%3:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
- S_NOP 0, implicit %6, implicit %7, implicit %8, implicit %9
- %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %3, 0, 0, 0, implicit $mode, implicit $exec
+ %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %3.sub4_sub5, %3.sub8_sub9, undef %6:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
S_CBRANCH_VCCNZ %bb.1, implicit $vcc
S_BRANCH %bb.2
@@ -711,9 +779,10 @@ body: |
...
-# There isn't an assignable AGPR around the second MFMA.
+# Chain of 2 untied MFMAs, but the second MFMA is a different size and
+# uses a subregister.
---
-name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_second
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_subreg
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
@@ -721,7 +790,7 @@ machineFunctionInfo:
occupancy: 10
sgprForEXECCopy: '$sgpr100_sgpr101'
body: |
- ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_second
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_subreg
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
@@ -737,20 +806,17 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit-def renamable $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit-def renamable $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47, implicit-def renamable $agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63
- ; CHECK-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit killed renamable $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit killed renamable $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47, implicit killed renamable $agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63
+ ; CHECK-NEXT: renamable $agpr16_agpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_16X16X16F16_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $agpr2_agpr3_agpr4_agpr5, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: liveins: $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35:0x00000000FFFFFFFF
+ ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
@@ -758,10 +824,7 @@ body: |
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
bb.0:
S_NOP 0, implicit-def $agpr0
@@ -777,9 +840,7 @@ body: |
undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %2:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
%3:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
- S_NOP 0, implicit-def %6:areg_512_align2, implicit-def %7:areg_512_align2, implicit-def %8:areg_512_align2, implicit-def %9:areg_512_align2
- %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %3, 0, 0, 0, implicit $mode, implicit $exec
- S_NOP 0, implicit %6, implicit %7, implicit %8, implicit %9
+ %4:vreg_128_align2 = V_MFMA_F32_16X16X16F16_vgprcd_e64 %1, %1, %3.sub2_sub3_sub4_sub5, 0, 0, 0, implicit $mode, implicit $exec
S_CBRANCH_VCCNZ %bb.1, implicit $vcc
S_BRANCH %bb.2
@@ -794,6 +855,228 @@ body: |
S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
%5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ S_ENDPGM 0
+
+...
+
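+# Single tied MFMA in one block; the result is inflated to AGPRs across the
+# register-pressure nops and copied back to VGPRs for the stores (local split).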
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_local_split
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_local_split
+ ; CHECK: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
+ ; CHECK-NEXT: renamable $agpr0_agpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_mac_e64 killed $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $vgpr10_vgpr11_vgpr12_vgpr13, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $vgpr14_vgpr15_vgpr16_vgpr17, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $vgpr2_vgpr3_vgpr4_vgpr5, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $vgpr6_vgpr7_vgpr8_vgpr9, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_512_align2 = COPY %0.sub8
+ undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
+ ; No VGPRs available for %0
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ S_ENDPGM 0
+
+...
+
+# Performs a split and inflate around the single instruction
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_instruction_split
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_instruction_split
+ ; CHECK: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: SI_SPILL_AV64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
+ ; CHECK-NEXT: renamable $agpr0_agpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr2_vgpr3 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_mac_e64 killed $vgpr2_vgpr3, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $vgpr10_vgpr11_vgpr12_vgpr13, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $vgpr14_vgpr15_vgpr16_vgpr17, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $vgpr2_vgpr3_vgpr4_vgpr5, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $vgpr6_vgpr7_vgpr8_vgpr9, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_512_align2 = COPY %0.sub8
+ undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
+ ; No VGPRs available for %0
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ S_ENDPGM 0
+
+...
+
+# Performs a split and inflate around the single-instruction, non-tied case
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_instruction_split
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_instruction_split
+ ; CHECK: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: SI_SPILL_AV64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
+ ; CHECK-NEXT: renamable $agpr0_agpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $agpr16_agpr17 = COPY killed renamable $agpr0_agpr1
+ ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 killed $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $vgpr10_vgpr11_vgpr12_vgpr13, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $vgpr14_vgpr15_vgpr16_vgpr17, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $vgpr2_vgpr3_vgpr4_vgpr5, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $vgpr6_vgpr7_vgpr8_vgpr9, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_512_align2 = COPY %0.sub8
+ undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
+ ; No VGPRs available for %0
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
@@ -802,9 +1085,11 @@ body: |
...
-# Chain of 2 untied cases, but the use isn't in src2.
+# This case does not fully use %0 after the MFMA. As a result,
+# SplitKit inserts a copy bundle for the subset of used lanes instead
+# of a simple copy.
---
-name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_non_src2
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_instruction_split_partial_uses_only
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
@@ -812,7 +1097,442 @@ machineFunctionInfo:
occupancy: 10
sgprForEXECCopy: '$sgpr100_sgpr101'
body: |
- ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_non_src2
+ bb.0:
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_instruction_split_partial_uses_only
+ ; CHECK: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: SI_SPILL_AV64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
+ ; CHECK-NEXT: renamable $agpr0_agpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr2_vgpr3 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_mac_e64 killed $vgpr2_vgpr3, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = COPY killed renamable $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = COPY renamable $agpr0_agpr1_agpr2_agpr3
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $vgpr10_vgpr11_vgpr12_vgpr13, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $vgpr14_vgpr15_vgpr16_vgpr17, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $vgpr2_vgpr3_vgpr4_vgpr5, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_512_align2 = COPY %0.sub8
+ undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
+ ; No VGPRs available for %0
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ S_ENDPGM 0
+
+...
+
+# Untied version of the previous case. This case does not fully use %4
+# after the MFMA. As a result, SplitKit inserts a copy bundle for the
+# subset of used lanes instead of a simple copy.
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_instruction_split_partial_uses_only
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_instruction_split_partial_uses_only
+ ; CHECK: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: SI_SPILL_AV64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
+ ; CHECK-NEXT: renamable $agpr0_agpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: renamable $agpr16_agpr17 = COPY killed renamable $agpr0_agpr1
+ ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 killed $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = COPY killed renamable $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = COPY renamable $agpr0_agpr1_agpr2_agpr3
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $vgpr10_vgpr11_vgpr12_vgpr13, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $vgpr14_vgpr15_vgpr16_vgpr17, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $vgpr2_vgpr3_vgpr4_vgpr5, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_512_align2 = COPY %0.sub8
+ undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
+ ; No VGPRs available for %4
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ S_ENDPGM 0
+
+...
+
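+# Tied mac case where src2 and the result are the same subregister
+# range (sub0-sub15) of a vreg_1024 accumulator updated in a loop.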
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_same_subreg
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_same_subreg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: renamable $vgpr11 = COPY renamable $vgpr10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: renamable $vgpr10_vgpr11_vgpr12_vgpr13 = GLOBAL_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: renamable $vgpr14_vgpr15_vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33:0x00000000FFFFFFFF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_1024_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_1024_align2 = COPY %0.sub8
+
+ bb.1:
+ liveins: $vcc
+
+ %0.sub0_sub1_sub2_sub3:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ %0.sub4_sub5_sub6_sub7:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ %0.sub8_sub9_sub10_sub11:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ %0.sub12_sub13_sub14_sub15:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ %0.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:vreg_1024_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15, 0, 0, 0, implicit $mode, implicit $exec
+ S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ ; No VGPRs available for %0
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ S_ENDPGM 0
+
+...
+
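+# Chain of 2 untied MFMAs where the user of the chained dst is an
+# inline asm that requires a VGPR register class.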
+---
+name: chained_mfma_dst_user_is_vgpr
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ ; CHECK-LABEL: name: chained_mfma_dst_user_is_vgpr
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr16_vgpr17 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vcc, $vgpr16_vgpr17
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 28114953 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33:0x00000000FFFFFFFF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x80000000)
+
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_512_align2 = COPY %0.sub8
+
+ bb.1:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $vcc
+
+ undef %2.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ early-clobber %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %2, 0, 0, 0, implicit $mode, implicit $exec
+ early-clobber %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 28114953 /* reguse:VReg_512_Align2 */, %4
+ S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %6:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %6, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %6, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %6, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %6, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ S_ENDPGM 0
+
+...
+
+# TODO: In this trivial case, the single copy required is cheaper than
+# the tuple copy.
+---
+name: chained_mfma_dst_user_is_vgpr_small_subreg
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ ; CHECK-LABEL: name: chained_mfma_dst_user_is_vgpr_small_subreg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr16_vgpr17 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vcc, $vgpr16_vgpr17
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead renamable $vgpr0 = nofpexcept V_CVT_F16_F32_e32 killed $vgpr0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33:0x00000000FFFFFFFF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x80000000)
+
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_512_align2 = COPY %0.sub8
+
+ bb.1:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $vcc
+
+ undef %2.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ early-clobber %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %2, 0, 0, 0, implicit $mode, implicit $exec
+ early-clobber %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
+ %5:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %4.sub0, implicit $mode, implicit $exec
+ S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %6:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %6, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %6, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %6, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %6, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ S_ENDPGM 0
+
+...
+
+# Transitive user of the register is an MFMA with non-register src2
+---
+name: chained_mfma_dst_user_has_imm_src2
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ ; CHECK-LABEL: name: chained_mfma_dst_user_has_imm_src2
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
@@ -828,18 +1548,18 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 = V_MFMA_F32_32X32X8F16_vgprcd_e64 killed $vgpr4_vgpr5, $vgpr8_vgpr9, undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $agpr0_agpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: early-clobber renamable $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X8F16_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X4F16_e64 $agpr0_agpr1, $vgpr18_vgpr19, 0, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: liveins: $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35:0x00000000FFFFFFFF
+ ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
@@ -853,6 +1573,8 @@ body: |
; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
bb.0:
+ successors: %bb.1(0x80000000)
+
S_NOP 0, implicit-def $agpr0
renamable $sgpr0 = S_MOV_B32 0
undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
@@ -862,16 +1584,103 @@ body: |
%0.sub9:vreg_512_align2 = COPY %0.sub8
bb.1:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
liveins: $vcc
- undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %2:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- %3:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
- %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %3.sub4_sub5, %3.sub8_sub9, undef %6:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
+ undef %2.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ early-clobber %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %2, 0, 0, 0, implicit $mode, implicit $exec
+ early-clobber %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
+ %4.sub0_sub1_sub2_sub3:vreg_512_align2 = V_MFMA_F32_4X4X4F16_vgprcd_e64 %4.sub0_sub1, %1, 0, 0, 0, 0, implicit $mode, implicit $exec
+
S_CBRANCH_VCCNZ %bb.1, implicit $vcc
S_BRANCH %bb.2
bb.2:
- ; No VGPRs available for %0 or %4
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %6:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %6, %4.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %6, %4.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %6, %4.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %6, %4.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ S_ENDPGM 0
+
+...
+
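+# Tied mac MFMA whose result is also consumed by an untied MFMA in the
+# same loop body.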
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_has_untied_user
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_has_untied_user
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr16_vgpr17 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vcc, $vgpr16_vgpr17
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $agpr16_agpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: renamable $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X8F16_mac_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_512_align2 = COPY %0.sub8
+
+ bb.1:
+ liveins: $vcc
+
+ undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
+ %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
+ S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ ; No VGPRs available for %0
S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -889,10 +1698,8 @@ body: |
...
-# Chain of 2 untied cases, but the second mfma is a different size and
-# uses a subregister.
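+# Same as the previous case, but the untied MFMA result is also read by
+# an inline asm that requires a VGPR register class.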
---
-name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_subreg
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_has_untied_user_with_vgpr_use
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
@@ -900,7 +1707,7 @@ machineFunctionInfo:
occupancy: 10
sgprForEXECCopy: '$sgpr100_sgpr101'
body: |
- ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_subreg
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_has_untied_user_with_vgpr_use
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
@@ -908,26 +1715,27 @@ body: |
; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
- ; CHECK-NEXT: renamable $vgpr18_vgpr19 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: renamable $vgpr16_vgpr17 = COPY killed renamable $sgpr0_sgpr1
; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
- ; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19
+ ; CHECK-NEXT: liveins: $vcc, $vgpr16_vgpr17
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_16X16X16F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 28114953 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: liveins: $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33:0x00000000FFFFFFFF
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
@@ -935,7 +1743,10 @@ body: |
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
bb.0:
S_NOP 0, implicit-def $agpr0
@@ -949,14 +1760,15 @@ body: |
bb.1:
liveins: $vcc
- undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %2:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
- %3:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
- %4:vreg_128_align2 = V_MFMA_F32_16X16X16F16_vgprcd_e64 %1, %1, %3.sub2_sub3_sub4_sub5, 0, 0, 0, implicit $mode, implicit $exec
+ undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
+ %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 28114953 /* reguse:VReg_512_Align2 */, %4
S_CBRANCH_VCCNZ %bb.1, implicit $vcc
S_BRANCH %bb.2
bb.2:
- ; No VGPRs available for %0 or %4
+ ; No VGPRs available for %0
S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -966,13 +1778,16 @@ body: |
S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
%5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- GLOBAL_STORE_DWORDX4_SADDR %5, %4, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
S_ENDPGM 0
...
---
-name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_same_subreg
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_has_tied_user
tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
@@ -980,34 +1795,199 @@ machineFunctionInfo:
occupancy: 10
sgprForEXECCopy: '$sgpr100_sgpr101'
body: |
- ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_mac_vgprcd_e64_same_subreg
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_has_tied_user
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
- ; CHECK-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr2_vgpr3 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vcc, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $agpr16_agpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr2_vgpr3, $vgpr2_vgpr3, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_mac_e64 $vgpr2_vgpr3, $vgpr2_vgpr3, killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_512_align2 = COPY %0.sub8
+
+ bb.1:
+ liveins: $vcc
+
+ undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec
+ %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %4, 0, 0, 0, implicit $mode, implicit $exec
+ S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ ; No VGPRs available for %0
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %5, %4.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ S_ENDPGM 0
+
+...
+
+# Non-mac variant, src2 is an immediate.
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_imm_src2
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1
; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
- ; CHECK-NEXT: renamable $vgpr11 = COPY renamable $vgpr10
+ ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s128), addrspace 1)
- ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = GLOBAL_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s128), addrspace 1)
- ; CHECK-NEXT: renamable $vgpr10_vgpr11_vgpr12_vgpr13 = GLOBAL_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s128), addrspace 1)
- ; CHECK-NEXT: renamable $vgpr14_vgpr15_vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s128), addrspace 1)
- ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: liveins: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33:0x00000000FFFFFFFF
+ ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ S_NOP 0, implicit-def $agpr0
+ renamable $sgpr0 = S_MOV_B32 0
+ undef %0.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr1 = COPY renamable $sgpr0
+ %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
+ renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ %0.sub9:vreg_512_align2 = COPY %0.sub8
+
+ bb.1:
+ liveins: $vcc
+
+ %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ ; No VGPRs available for %0
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+ GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
+ S_ENDPGM 0
+
+...
+
+# Non-mac variant, src2 is the same VGPR, but a different subregister.
+---
+name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $agpr0
+ ; CHECK-NEXT: renamable $sgpr0 = S_MOV_B32 0
+ ; CHECK-NEXT: renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr1 = COPY renamable $sgpr0
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $sgpr0_sgpr1
+ ; CHECK-NEXT: renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+ ; CHECK-NEXT: dead renamable $vgpr9 = COPY renamable $vgpr8
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $agpr0_agpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: renamable $agpr16_agpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31:0x00000000FFFFFFFF
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -1034,11 +2014,9 @@ body: |
bb.1:
liveins: $vcc
- %0.sub0_sub1_sub2_sub3:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s128), addrspace 1)
- %0.sub4_sub5_sub6_sub7:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s128), addrspace 1)
- %0.sub8_sub9_sub10_sub11:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s128), addrspace 1)
- %0.sub12_sub13_sub14_sub15:vreg_1024_align2 = GLOBAL_LOAD_DWORDX4 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s128), addrspace 1)
- %0.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:vreg_1024_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15, 0, 0, 0, implicit $mode, implicit $exec
+ undef %0.sub0_sub1:vreg_1024_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %0.sub16_sub17:vreg_1024_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %0.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:vreg_1024_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31, 0, 0, 0, implicit $mode, implicit $exec
S_CBRANCH_VCCNZ %bb.1, implicit $vcc
S_BRANCH %bb.2
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
new file mode 100644
index 0000000..63cde0d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
@@ -0,0 +1,72 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
+
+; CHECK-LABEL: .shader_functions:
+
+; Use VGPRs above the input arguments.
+; CHECK-LABEL: _miss_1:
+; CHECK: .vgpr_count: 0x1d{{$}}
+
+define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
+ i32 %vcr, { i32 } %system.data,
+ i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
+ i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
+ i32 %inactive.vgpr8, i32 %inactive.vgpr9)
+ local_unnamed_addr {
+entry:
+ %system.data.value = extractvalue { i32 } %system.data, 0
+ %dead.val = call i32 @llvm.amdgcn.dead.i32()
+ %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
+ br i1 %is.whole.wave, label %shader, label %tail
+
+shader:
+ %system.data.extract = extractvalue { i32 } %system.data, 0
+ %data.mul = mul i32 %system.data.extract, 2
+ %data.add = add i32 %data.mul, 1
+ call void asm sideeffect "; clobber v28", "~{v28}"()
+ br label %tail
+
+tail:
+ %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
+ %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
+ %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
+ %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
+ %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
+ %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
+ %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
+ %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
+ %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
+ %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
+ %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
+ %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
+
+ %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
+ %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
+ %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
+ %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
+ %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
+ %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
+ %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
+ %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
+ %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
+ %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
+ %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
+ %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
+
+ %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
+ %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
+ %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
+ %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
+
+ call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
+ @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
+ ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
+ { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
+ i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
+ unreachable
+}
+
+declare i32 @llvm.amdgcn.dead.i32()
+declare i1 @llvm.amdgcn.init.whole.wave()
+declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
+
+declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
new file mode 100644
index 0000000..236f2c1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
@@ -0,0 +1,46 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
+
+; CHECK-LABEL: .shader_functions:
+
+; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
+; CHECK-LABEL: leaf_shader:
+; CHECK: .vgpr_count: 0xc{{$}}
+
+; Function without calls.
+define amdgpu_cs_chain void @_leaf_shader(ptr %output.ptr, i32 inreg %input.value,
+ i32 %active.vgpr1, i32 %active.vgpr2,
+ i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
+ i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6)
+ local_unnamed_addr {
+entry:
+ %dead.val = call i32 @llvm.amdgcn.dead.i32()
+ %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
+ br i1 %is.whole.wave, label %compute, label %merge
+
+compute:
+ ; Perform a more complex computation using active VGPRs
+ %square = mul i32 %active.vgpr1, %active.vgpr1
+ %product = mul i32 %square, %active.vgpr2
+ %sum = add i32 %product, %input.value
+ %result = add i32 %sum, 42
+ br label %merge
+
+merge:
+ %final.result = phi i32 [ 0, %entry ], [ %result, %compute ]
+ %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %compute ]
+ %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %compute ]
+ %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %compute ]
+ %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %compute ]
+ %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %compute ]
+ %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %compute ]
+
+ store i32 %final.result, ptr %output.ptr, align 4
+
+ ret void
+}
+
+declare i32 @llvm.amdgcn.dead.i32()
+declare i1 @llvm.amdgcn.init.whole.wave()
+declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
+
+declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
new file mode 100644
index 0000000..515e0db
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
@@ -0,0 +1,74 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
+
+; CHECK-LABEL: .shader_functions:
+
+; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
+; The shader is free to use any of the VGPRs mapped to a %inactive.vgpr as long as it only touches its active lanes.
+; In that case, the VGPR should be included in the .vgpr_count.
+; CHECK-LABEL: _miss_1:
+; CHECK: .vgpr_count: 0xd{{$}}
+
+define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
+ i32 %vcr, { i32 } %system.data,
+ i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
+ i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
+ i32 %inactive.vgpr8, i32 %inactive.vgpr9)
+ local_unnamed_addr {
+entry:
+ %system.data.value = extractvalue { i32 } %system.data, 0
+ %dead.val = call i32 @llvm.amdgcn.dead.i32()
+ %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
+ br i1 %is.whole.wave, label %shader, label %tail
+
+shader:
+ %system.data.extract = extractvalue { i32 } %system.data, 0
+ %data.mul = mul i32 %system.data.extract, 2
+ %data.add = add i32 %data.mul, 1
+ call void asm sideeffect "; clobber VGPR for %inactive.vgpr2", "~{v12}"()
+ br label %tail
+
+tail:
+ %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
+ %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
+ %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
+ %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
+ %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
+ %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
+ %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
+ %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
+ %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
+ %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
+ %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
+ %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
+
+ %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
+ %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
+ %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
+ %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
+ %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
+ %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
+ %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
+ %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
+ %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
+ %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
+ %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
+ %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
+
+ %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
+ %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
+ %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
+ %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
+
+ call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
+ @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
+ ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
+ { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
+ i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
+ unreachable
+}
+
+declare i32 @llvm.amdgcn.dead.i32()
+declare i1 @llvm.amdgcn.init.whole.wave()
+declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
+
+declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
new file mode 100644
index 0000000..2428f70
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
@@ -0,0 +1,71 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
+
+; CHECK-LABEL: .shader_functions:
+
+; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
+; CHECK-LABEL: _miss_1:
+; CHECK: .vgpr_count: 0xa{{$}}
+
+define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
+ i32 %vcr, { i32 } %system.data,
+ i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
+ i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
+ i32 %inactive.vgpr8, i32 %inactive.vgpr9)
+ local_unnamed_addr {
+entry:
+ %system.data.value = extractvalue { i32 } %system.data, 0
+ %dead.val = call i32 @llvm.amdgcn.dead.i32()
+ %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
+ br i1 %is.whole.wave, label %shader, label %tail
+
+shader:
+ %system.data.extract = extractvalue { i32 } %system.data, 0
+ %data.mul = mul i32 %system.data.extract, 2
+ %data.add = add i32 %data.mul, 1
+ br label %tail
+
+tail:
+ %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
+ %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
+ %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
+ %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
+ %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
+ %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
+ %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
+ %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
+ %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
+ %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
+ %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
+ %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
+
+ %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
+ %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
+ %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
+ %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
+ %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
+ %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
+ %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
+ %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
+ %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
+ %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
+ %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
+ %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
+
+ %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
+ %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
+ %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
+ %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
+
+ call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
+ @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
+ ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
+ { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
+ i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
+ unreachable
+}
+
+declare i32 @llvm.amdgcn.dead.i32()
+declare i1 @llvm.amdgcn.init.whole.wave()
+declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
+
+declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm-av-constraint-err.ll b/llvm/test/CodeGen/AMDGPU/inline-asm-av-constraint-err.ll
new file mode 100644
index 0000000..8b45f0d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm-av-constraint-err.ll
@@ -0,0 +1,27 @@
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx90a -filetype=null %s 2>&1 | FileCheck %s
+
+; Make sure illegal type uses are correctly diagnosed
+
+; CHECK: error: couldn't allocate input reg for constraint 'VA'
+define void @use_A_i8(i8 %x) {
+ call void asm sideeffect "; use $0", "^VA"(i8 %x)
+ ret void
+}
+
+; CHECK: error: couldn't allocate output register for constraint 'VA'
+define i8 @def_A_i8() {
+ %ret = call i8 asm sideeffect "; def $0", "=^VA"()
+ ret i8 %ret
+}
+
+; CHECK: error: couldn't allocate input reg for constraint 'VA'
+define void @use_A_i1(i1 %x) {
+ call void asm sideeffect "; use $0", "^VA"(i1 %x)
+ ret void
+}
+
+; CHECK: error: couldn't allocate output register for constraint 'VA'
+define i1 @def_A_i1() {
+ %ret = call i1 asm sideeffect "; def $0", "=^VA"()
+ ret i1 %ret
+}
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm-av-constraint.ll b/llvm/test/CodeGen/AMDGPU/inline-asm-av-constraint.ll
new file mode 100644
index 0000000..407a802
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm-av-constraint.ll
@@ -0,0 +1,217 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx908 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s
+
+; FIXME: Shouldn't emit the and instruction
+; ERR: error: couldn't allocate input reg for constraint 'VA'
+define void @use_A_i16(i16 %x) {
+; CHECK-LABEL: use_A_i16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ call void asm sideeffect "; use $0", "^VA"(i16 %x)
+ ret void
+}
+
+; ERR: error: couldn't allocate input reg for constraint 'VA'
+define void @use_A_f16(half %x) {
+; CHECK-LABEL: use_A_f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ call void asm sideeffect "; use $0", "^VA"(half %x)
+ ret void
+}
+
+; ERR: error: couldn't allocate input reg for constraint 'VA'
+define void @use_A_bf16(bfloat %x) {
+; CHECK-LABEL: use_A_bf16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ call void asm sideeffect "; use $0", "^VA"(bfloat %x)
+ ret void
+}
+
+; ERR: error: couldn't allocate input reg for constraint 'VA'
+define void @use_A_v2i16(<2 x i16> %x) {
+; CHECK-LABEL: use_A_v2i16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ call void asm sideeffect "; use $0", "^VA"(<2 x i16> %x)
+ ret void
+}
+
+; ERR: error: couldn't allocate input reg for constraint 'VA'
+define void @use_A_v2f16(<2 x half> %x) {
+; CHECK-LABEL: use_A_v2f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ call void asm sideeffect "; use $0", "^VA"(<2 x half> %x)
+ ret void
+}
+
+; ERR: error: couldn't allocate input reg for constraint 'VA'
+define void @use_A_v2bf16(<2 x bfloat> %x) {
+; CHECK-LABEL: use_A_v2bf16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ call void asm sideeffect "; use $0", "^VA"(<2 x bfloat> %x)
+ ret void
+}
+
+; ERR: error: couldn't allocate input reg for constraint 'VA'
+define void @use_A_i32(i32 %x) {
+; CHECK-LABEL: use_A_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ call void asm sideeffect "; use $0", "^VA"(i32 %x)
+ ret void
+}
+
+; ERR: error: couldn't allocate input reg for constraint 'VA'
+define void @use_A_f32(float %x) {
+; CHECK-LABEL: use_A_f32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ call void asm sideeffect "; use $0", "^VA"(float %x)
+ ret void
+}
+
+; ERR: error: couldn't allocate input reg for constraint 'VA'
+define void @use_A_i64(i64 %x) {
+; CHECK-LABEL: use_A_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ call void asm sideeffect "; use $0", "^VA"(i64 %x)
+ ret void
+}
+
+; ERR: error: couldn't allocate input reg for constraint 'VA'
+define void @use_A_f64(double %x) {
+; CHECK-LABEL: use_A_f64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ call void asm sideeffect "; use $0", "^VA"(double %x)
+ ret void
+}
+
+; ERR: error: couldn't allocate input reg for constraint 'VA'
+define void @use_A_p1(ptr addrspace(1) %x) {
+; CHECK-LABEL: use_A_p1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ call void asm sideeffect "; use $0", "^VA"(ptr addrspace(1) %x)
+ ret void
+}
+
+; ERR: error: couldn't allocate input reg for constraint 'VA'
+define void @use_A_v32i32(<32 x i32> %x) {
+; CHECK-LABEL: use_A_v32i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ call void asm sideeffect "; use $0", "^VA"(<32 x i32> %x)
+ ret void
+}
+
+; ERR: error: couldn't allocate input reg for constraint 'VA'
+define void @use_A_v32f32(<32 x float> %x) {
+; CHECK-LABEL: use_A_v32f32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ call void asm sideeffect "; use $0", "^VA"(<32 x float> %x)
+ ret void
+}
+
+; ERR: error: couldn't allocate output register for constraint 'VA'
+define i16 @def_A_i16() {
+; CHECK-LABEL: def_A_i16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %ret = call i16 asm sideeffect "; def $0", "=^VA"()
+ ret i16 %ret
+}
+
+; ERR: error: couldn't allocate output register for constraint 'VA'
+define i32 @def_A_i32() {
+; CHECK-LABEL: def_A_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %ret = call i32 asm sideeffect "; def $0", "=^VA"()
+ ret i32 %ret
+}
+
+; ERR: error: couldn't allocate output register for constraint 'VA'
+define ptr addrspace(1) @def_A_p1() {
+; CHECK-LABEL: def_A_p1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %ret = call ptr addrspace(1) asm sideeffect "; def $0", "=^VA"()
+ ret ptr addrspace(1) %ret
+}
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
index 0ca180e..3ce8a80 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -8,16 +8,16 @@
define amdgpu_kernel void @s_input_output_i128() {
; GFX908-LABEL: name: s_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7208970 /* regdef:SGPR_128 */, def %13
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %13
; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %13
- ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7208969 /* reguse:SGPR_128 */, %14
+ ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: s_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7208970 /* regdef:SGPR_128 */, def %11
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %11
; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11
- ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7208969 /* reguse:SGPR_128 */, %12
+ ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=s"()
call void asm sideeffect "; use $0", "s"(i128 %val)
@@ -27,16 +27,16 @@ define amdgpu_kernel void @s_input_output_i128() {
define amdgpu_kernel void @v_input_output_i128() {
; GFX908-LABEL: name: v_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %13
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %13
; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %13
- ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6094857 /* reguse:VReg_128 */, %14
+ ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:VReg_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: v_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %11
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %11
; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %11
- ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:VReg_128_Align2 */, %12
+ ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6553609 /* reguse:VReg_128_Align2 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=v"()
call void asm sideeffect "; use $0", "v"(i128 %val)
@@ -44,19 +44,19 @@ define amdgpu_kernel void @v_input_output_i128() {
}
define amdgpu_kernel void @a_input_output_i128() {
+
; GFX908-LABEL: name: a_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6029322 /* regdef:AReg_128 */, def %13
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %13
; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %13
- ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6029321 /* reguse:AReg_128 */, %14
-
+ ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6160393 /* reguse:AReg_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: a_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:AReg_128_Align2 */, def %11
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %11
; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %11
- ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6291465 /* reguse:AReg_128_Align2 */, %12
+ ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = call i128 asm sideeffect "; def $0", "=a"()
call void asm sideeffect "; use $0", "a"(i128 %val)
diff --git a/llvm/test/CodeGen/AMDGPU/inline-attr.ll b/llvm/test/CodeGen/AMDGPU/inline-attr.ll
index 1e56f6f8..4ae0ba0 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-attr.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-attr.ll
@@ -7,7 +7,7 @@ declare void @extern() #0
define float @foo(float %x) #0 {
; GCN-LABEL: define float @foo(
-; GCN-SAME: float [[X:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+; GCN-SAME: float [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; GCN-NEXT: [[ENTRY:.*:]]
; GCN-NEXT: tail call void @extern()
; GCN-NEXT: [[MUL:%.*]] = fmul float [[X]], 1.500000e+01
@@ -21,7 +21,7 @@ entry:
define amdgpu_kernel void @caller(ptr addrspace(1) %p) #1 {
; GCN-LABEL: define amdgpu_kernel void @caller(
-; GCN-SAME: ptr addrspace(1) captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+; GCN-SAME: ptr addrspace(1) captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
; GCN-NEXT: [[ENTRY:.*:]]
; GCN-NEXT: [[LOAD:%.*]] = load float, ptr addrspace(1) [[P]], align 4, !amdgpu.noclobber [[META0:![0-9]+]]
; GCN-NEXT: tail call void @extern()
@@ -40,17 +40,14 @@ attributes #0 = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="t
attributes #1 = { nounwind "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" }
;.
-; UNSAFE: attributes #[[ATTR0:[0-9]+]] = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
-; UNSAFE: attributes #[[ATTR1]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
-; UNSAFE: attributes #[[ATTR2]] = { nounwind "amdgpu-waves-per-eu"="4,10" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
+; UNSAFE: attributes #[[ATTR0]] = { nounwind "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
+; UNSAFE: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
;.
-; NONANS: attributes #[[ATTR0:[0-9]+]] = { nounwind "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
-; NONANS: attributes #[[ATTR1]] = { nounwind "amdgpu-waves-per-eu"="4,10" "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
-; NONANS: attributes #[[ATTR2]] = { nounwind "amdgpu-waves-per-eu"="4,10" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
+; NONANS: attributes #[[ATTR0]] = { nounwind "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
+; NONANS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
;.
-; NOINFS: attributes #[[ATTR0:[0-9]+]] = { nounwind "no-infs-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
-; NOINFS: attributes #[[ATTR1]] = { nounwind "amdgpu-waves-per-eu"="4,10" "no-infs-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
-; NOINFS: attributes #[[ATTR2]] = { nounwind "amdgpu-waves-per-eu"="4,10" "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
+; NOINFS: attributes #[[ATTR0]] = { nounwind "no-infs-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
+; NOINFS: attributes #[[ATTR1]] = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
;.
; UNSAFE: [[META0]] = !{}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll
index 9f7f228..535e02c 100644
--- a/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll
@@ -18,6 +18,12 @@ define amdgpu_kernel void @v_input_output_i8() {
ret void
}
+; GCN: error: couldn't allocate input reg for constraint 'v'
+define amdgpu_kernel void @v_input_empty_struct() {
+ call void asm "", "v"({} poison)
+ ret void
+}
+
; SICI: error: couldn't allocate output register for constraint 's'
; SICI: error: couldn't allocate input reg for constraint 's'
; VI-NOT: error
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
index f437dee..d33a809 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
@@ -489,3 +489,126 @@ body: |
S_ENDPGM 0
...
+---
+name: skip_barrier_init_imm
+body: |
+ ; CHECK-LABEL: name: skip_barrier_init_imm
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: V_NOP_e32 implicit $exec
+ ; CHECK-NEXT: $m0 = S_MOV_B32 -1
+ ; CHECK-NEXT: S_BARRIER_INIT_IMM -1, implicit $m0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1, %bb.2
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+ bb.1:
+ successors: %bb.2
+ V_NOP_e32 implicit $exec
+ $m0 = S_MOV_B32 -1
+ S_BARRIER_INIT_IMM -1, implicit $m0
+
+ bb.2:
+ S_ENDPGM 0
+...
+
+---
+name: skip_barrier_init_m0
+body: |
+ ; CHECK-LABEL: name: skip_barrier_init_m0
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: V_NOP_e32 implicit $exec
+ ; CHECK-NEXT: $m0 = S_MOV_B32 -1
+ ; CHECK-NEXT: S_BARRIER_INIT_M0 implicit $m0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1, %bb.2
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+ bb.1:
+ successors: %bb.2
+ V_NOP_e32 implicit $exec
+ $m0 = S_MOV_B32 -1
+ S_BARRIER_INIT_M0 implicit $m0
+
+ bb.2:
+ S_ENDPGM 0
+...
+
+---
+name: skip_barrier_join_imm
+body: |
+ ; CHECK-LABEL: name: skip_barrier_join_imm
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: V_NOP_e32 implicit $exec
+ ; CHECK-NEXT: S_BARRIER_JOIN_IMM -1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1, %bb.2
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+ bb.1:
+ successors: %bb.2
+ V_NOP_e32 implicit $exec
+ S_BARRIER_JOIN_IMM -1
+
+ bb.2:
+ S_ENDPGM 0
+...
+
+---
+name: skip_barrier_leave
+body: |
+ ; CHECK-LABEL: name: skip_barrier_leave
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: V_NOP_e32 implicit $exec
+ ; CHECK-NEXT: S_BARRIER_LEAVE implicit-def $scc
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1, %bb.2
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+ bb.1:
+ successors: %bb.2
+ V_NOP_e32 implicit $exec
+ S_BARRIER_LEAVE implicit-def $scc
+
+ bb.2:
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
index fb075221..58cd2f5 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
@@ -17,7 +17,7 @@ define fastcc i32 @foo() {
; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr17
; CHECK-NEXT: $sgpr32 = frame-setup S_ADDK_I32 $sgpr32, 512, implicit-def dead $scc
; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr16, 2, undef $vgpr40
- ; CHECK-NEXT: BUNDLE implicit-def $sgpr16_sgpr17, implicit-def $sgpr16, implicit-def $sgpr16_lo16, implicit-def $sgpr16_hi16, implicit-def $sgpr17, implicit-def $sgpr17_lo16, implicit-def $sgpr17_hi16, implicit-def $scc {
+ ; CHECK-NEXT: BUNDLE implicit-def $sgpr16_sgpr17, implicit-def $sgpr16, implicit-def $scc, implicit-def $sgpr17 {
; CHECK-NEXT: $sgpr16_sgpr17 = S_GETPC_B64
; CHECK-NEXT: $sgpr16 = S_ADD_U32 internal $sgpr16, target-flags(amdgpu-gotprel32-lo) @bar + 4, implicit-def $scc
; CHECK-NEXT: $sgpr17 = S_ADDC_U32 internal $sgpr17, target-flags(amdgpu-gotprel32-hi) @bar + 12, implicit-def $scc, implicit internal $scc
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
index d8c983a..b81fdd3 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
@@ -3,6 +3,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -check-prefix=VI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX900 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
; SI-LABEL: s_insertelement_v2bf16_0:
@@ -61,6 +62,19 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
; GFX942-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: s_insertelement_v2bf16_0:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_lshr_b32 s2, s2, 16
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_pack_ll_b32_b16 s2, 0x40a0, s2
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%vec = load <2 x bfloat>, ptr addrspace(4) %vec.ptr
%vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 0
store <2 x bfloat> %vecins, ptr addrspace(1) %out
@@ -122,6 +136,18 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
; GFX942-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: s_insertelement_v2bf16_1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_pack_ll_b32_b16 s2, s2, 0x40a0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%vec = load <2 x bfloat>, ptr addrspace(4) %vec.ptr
%vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 1
store <2 x bfloat> %vecins, ptr addrspace(1) %out
@@ -193,6 +219,19 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
; GFX942-NEXT: v_bfi_b32 v1, s2, v2, v1
; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v2bf16_0:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_movk_i32 s2, 0x40a0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_bfi_b32 v1, 0xffff, s2, v1
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -266,6 +305,17 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) %
; GFX942-NEXT: v_bfi_b32 v1, s2, 53, v1
; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v2bf16_0_inlineimm:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_bfi_b32 v1, 0xffff, 53, v1
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -341,6 +391,19 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
; GFX942-NEXT: v_perm_b32 v1, s2, v1, v2
; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v2bf16_1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_movk_i32 s2, 0x40a0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_perm_b32 v1, s2, v1, 0x5040100
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -414,6 +477,17 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) %
; GFX942-NEXT: v_perm_b32 v1, 35, v1, v2
; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v2bf16_1_inlineimm:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_perm_b32 v1, 35, v1, 0x5040100
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -514,6 +588,25 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1)
; GFX942-NEXT: v_bfi_b32 v1, v1, s2, v2
; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v2bf16_dynamic_vgpr:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_load_b32 v1, v0, s[6:7] scale_offset
+; GFX1250-NEXT: global_load_b32 v2, v0, s[2:3] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x1
+; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_lshlrev_b32_e64 v1, v1, 0xffff
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_bitop3_b32 v1, 0x12341234, v2, v1 bitop3:0xe4
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -597,6 +690,19 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a
; GFX942-NEXT: v_bfi_b32 v0, s2, v3, v0
; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v4bf16_0:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x30
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[2:3] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_bfi_b32 v0, 0xffff, s4, v0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -678,6 +784,19 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a
; GFX942-NEXT: v_perm_b32 v0, s6, v0, v3
; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v4bf16_1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[2:3] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_perm_b32 v0, s4, v0, 0x5040100
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -761,6 +880,19 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a
; GFX942-NEXT: v_bfi_b32 v1, s2, v3, v1
; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v4bf16_2:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x30
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[2:3] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_bfi_b32 v1, 0xffff, s4, v1
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -842,6 +974,19 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a
; GFX942-NEXT: v_perm_b32 v1, s6, v1, v3
; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v4bf16_3:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[2:3] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_perm_b32 v1, s4, v1, 0x5040100
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -946,6 +1091,24 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1)
; GFX942-NEXT: v_bfi_b32 v0, s2, v4, v0
; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v4bf16_dynamic_sgpr:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[2:3] scale_offset
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_lshl_b32 s2, s5, 4
+; GFX1250-NEXT: s_pack_ll_b32_b16 s4, s4, s4
+; GFX1250-NEXT: s_lshl_b64 s[2:3], 0xffff, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_bfi_b32 v1, s3, s4, v1
+; GFX1250-NEXT: v_bfi_b32 v0, s2, s4, v0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -1028,6 +1191,19 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a
; GFX942-NEXT: v_perm_b32 v1, s6, v1, v5
; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v8bf16_3:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b128 v[0:3], v4, s[2:3] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_perm_b32 v1, s4, v1, 0x5040100
+; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -1244,6 +1420,50 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out,
; GFX942-NEXT: v_perm_b32 v0, v0, v9, s14
; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v8bf16_dynamic:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b128 v[0:3], v4, s[2:3] scale_offset
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 6
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 7
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_cndmask_b32_e64 v5, v3, s4, s2
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 4
+; GFX1250-NEXT: v_dual_lshrrev_b32 v3, 16, v3 :: v_dual_lshrrev_b32 v6, 16, v2
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 5
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, v2, s4, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 2
+; GFX1250-NEXT: v_dual_lshrrev_b32 v7, 16, v1 :: v_dual_lshrrev_b32 v8, 16, v0
+; GFX1250-NEXT: v_cndmask_b32_e64 v3, v3, s4, s2
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 3
+; GFX1250-NEXT: v_cndmask_b32_e64 v1, v1, s4, s2
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v7, v7, s4, s2
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 1
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v6, v6, s4, s3
+; GFX1250-NEXT: v_cndmask_b32_e64 v8, v8, s4, s2
+; GFX1250-NEXT: v_perm_b32 v3, v3, v5, 0x5040100
+; GFX1250-NEXT: v_perm_b32 v1, v7, v1, 0x5040100
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_perm_b32 v2, v6, v2, 0x5040100
+; GFX1250-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
+; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -1342,6 +1562,26 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr
; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v16bf16_3:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_load_b128 v[0:3], v8, s[2:3]
+; GFX1250-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16
+; GFX1250-NEXT: s_wait_loadcnt 0x1
+; GFX1250-NEXT: v_perm_b32 v1, s4, v1, 0x5040100
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -1715,6 +1955,87 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out
; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[36:37] offset:16
; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[36:37]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v16bf16_dynamic:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_load_b128 v[0:3], v8, s[2:3]
+; GFX1250-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 6
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 7
+; GFX1250-NEXT: s_wait_loadcnt 0x1
+; GFX1250-NEXT: v_cndmask_b32_e64 v9, v3, s4, s2
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 4
+; GFX1250-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 5
+; GFX1250-NEXT: v_dual_lshrrev_b32 v10, 16, v2 :: v_dual_lshrrev_b32 v11, 16, v1
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, v2, s4, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 2
+; GFX1250-NEXT: v_cndmask_b32_e64 v3, v3, s4, s2
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 3
+; GFX1250-NEXT: v_cndmask_b32_e64 v1, v1, s4, s2
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_dual_lshrrev_b32 v12, 16, v0 :: v_dual_lshrrev_b32 v13, 16, v7
+; GFX1250-NEXT: v_cndmask_b32_e64 v11, v11, s4, s2
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 1
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 14
+; GFX1250-NEXT: v_cndmask_b32_e64 v10, v10, s4, s3
+; GFX1250-NEXT: v_perm_b32 v3, v3, v9, 0x5040100
+; GFX1250-NEXT: v_cndmask_b32_e64 v9, v12, s4, s2
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 15
+; GFX1250-NEXT: v_cndmask_b32_e64 v7, v7, s4, s2
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 12
+; GFX1250-NEXT: v_dual_lshrrev_b32 v14, 16, v6 :: v_dual_lshrrev_b32 v15, 16, v5
+; GFX1250-NEXT: v_perm_b32 v2, v10, v2, 0x5040100
+; GFX1250-NEXT: v_cndmask_b32_e64 v10, v13, s4, s2
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 13
+; GFX1250-NEXT: v_cndmask_b32_e64 v6, v6, s4, s2
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 10
+; GFX1250-NEXT: v_cndmask_b32_e64 v12, v14, s4, s2
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 11
+; GFX1250-NEXT: v_cndmask_b32_e64 v5, v5, s4, s2
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 8
+; GFX1250-NEXT: v_lshrrev_b32_e32 v16, 16, v4
+; GFX1250-NEXT: v_cndmask_b32_e64 v13, v15, s4, s2
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: s_cmp_eq_u32 s5, 9
+; GFX1250-NEXT: v_cndmask_b32_e64 v4, v4, s4, s2
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_perm_b32 v7, v10, v7, 0x5040100
+; GFX1250-NEXT: v_cndmask_b32_e64 v14, v16, s4, s2
+; GFX1250-NEXT: v_perm_b32 v6, v12, v6, 0x5040100
+; GFX1250-NEXT: v_perm_b32 v5, v13, v5, 0x5040100
+; GFX1250-NEXT: v_perm_b32 v1, v11, v1, 0x5040100
+; GFX1250-NEXT: v_perm_b32 v0, v9, v0, 0x5040100
+; GFX1250-NEXT: v_perm_b32 v4, v14, v4, 0x5040100
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
diff --git a/llvm/test/CodeGen/AMDGPU/integer-canonicalizing-src-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-canonicalizing-src-modifiers.ll
new file mode 100644
index 0000000..7b356d2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/integer-canonicalizing-src-modifiers.ll
@@ -0,0 +1,237 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+
+; Demonstrate that folding integer bitmasks that affect the sign bit into source
+; modifiers (srcmods) does not apply to canonicalizing instructions.
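+;
+; For contrast, a minimal sketch (an illustration only, not an autogenerated check):
+; for a non-canonicalizing consumer such as select, this patch's
+; integer-select-src-modifiers.ll shows the same sign-bit mask folding into a
+; source modifier, e.g.
+;   %neg.a = xor i32 %a, u0x80000000
+;   %sel   = select i1 %cmp, i32 %neg.a, i32 %b
+; lowers to v_cndmask_b32 with a -vN operand, whereas the uitofp cases below
+; keep an explicit v_and_b32/s_and_b32 of the mask.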
+
+define double @v_uitofp_i32_to_f64_abs(i32 %arg0) nounwind {
+; GCN-LABEL: v_uitofp_i32_to_f64_abs:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_i32_to_f64_abs:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %arg0.abs = and i32 %arg0, u0x7fffffff
+ %cvt = uitofp i32 %arg0.abs to double
+ ret double %cvt
+}
+
+define double @v_uitofp_i32_to_f64_neg(i32 %arg0) nounwind {
+; GCN-LABEL: v_uitofp_i32_to_f64_neg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_i32_to_f64_neg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %arg0.neg = and i32 %arg0, u0x80000000
+ %cvt = uitofp i32 %arg0.neg to double
+ ret double %cvt
+}
+
+define double @s_uitofp_i32_to_f64_abs(i32 inreg %arg0) nounwind {
+; GCN-LABEL: s_uitofp_i32_to_f64_abs:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_bitset0_b32 s16, 31
+; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s16
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_uitofp_i32_to_f64_abs:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_bitset0_b32 s0, 31
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %arg0.abs = and i32 %arg0, u0x7fffffff
+ %cvt = uitofp i32 %arg0.abs to double
+ ret double %cvt
+}
+
+define double @s_uitofp_i32_to_f64_neg(i32 inreg %arg0) nounwind {
+; GCN-LABEL: s_uitofp_i32_to_f64_neg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s4, s16, 0x80000000
+; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s4
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_uitofp_i32_to_f64_neg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s0, s0, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %arg0.neg = and i32 %arg0, u0x80000000
+ %cvt = uitofp i32 %arg0.neg to double
+ ret double %cvt
+}
+
+define half @v_uitofp_i16_to_f16_abs(i16 %arg0) nounwind {
+; GFX7-LABEL: v_uitofp_i16_to_f16_abs:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_uitofp_i16_to_f16_abs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX9-NEXT: v_cvt_f16_u16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_uitofp_i16_to_f16_abs:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_uitofp_i16_to_f16_abs:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %arg0.abs = and i16 %arg0, u0x7fff
+ %cvt = uitofp i16 %arg0.abs to half
+ ret half %cvt
+}
+
+define half @v_uitofp_i16_to_f16_neg(i16 %arg0) nounwind {
+; GFX7-LABEL: v_uitofp_i16_to_f16_neg:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0
+; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_uitofp_i16_to_f16_neg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
+; GFX9-NEXT: v_cvt_f16_u16_e32 v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_uitofp_i16_to_f16_neg:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_uitofp_i16_to_f16_neg:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %arg0.neg = and i16 %arg0, u0x8000
+ %cvt = uitofp i16 %arg0.neg to half
+ ret half %cvt
+}
+
+define half @s_uitofp_i16_to_f16_abs(i16 inreg %arg0) nounwind {
+; GFX7-LABEL: s_uitofp_i16_to_f16_abs:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_and_b32 s4, s16, 0x7fff
+; GFX7-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_uitofp_i16_to_f16_abs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_and_b32 s4, s16, 0x7fff
+; GFX9-NEXT: v_cvt_f16_u16_e32 v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: s_uitofp_i16_to_f16_abs:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: s_uitofp_i16_to_f16_abs:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v0, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %arg0.abs = and i16 %arg0, u0x7fff
+ %cvt = uitofp i16 %arg0.abs to half
+ ret half %cvt
+}
+
+define half @s_uitofp_i16_to_f16_neg(i16 inreg %arg0) nounwind {
+; GFX7-LABEL: s_uitofp_i16_to_f16_neg:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_and_b32 s4, s16, 0x8000
+; GFX7-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_uitofp_i16_to_f16_neg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_and_b32 s4, s16, 0x8000
+; GFX9-NEXT: v_cvt_f16_u16_e32 v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: s_uitofp_i16_to_f16_neg:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0x8000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: s_uitofp_i16_to_f16_neg:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0x8000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v0, s0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %arg0.neg = and i16 %arg0, u0x8000
+ %cvt = uitofp i16 %arg0.neg to half
+ ret half %cvt
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 742d87f..31b6b53 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -1715,9 +1715,9 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) {
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l
; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16:
@@ -1745,8 +1745,7 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) {
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16:
@@ -1777,9 +1776,9 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) {
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l
; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l
-; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16:
@@ -1815,8 +1814,7 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) {
; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h
-; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1200-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16:
@@ -9363,9 +9361,9 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l
; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16_x2:
@@ -9409,8 +9407,7 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16_x2:
@@ -9457,9 +9454,9 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l
; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h
-; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l
-; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16_x2:
@@ -9511,8 +9508,7 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h
-; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1200-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16_x2:
diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-src-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-src-modifiers.ll
new file mode 100644
index 0000000..b3c7ac8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/integer-select-src-modifiers.ll
@@ -0,0 +1,1011 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+
+define i32 @fneg_select_i32_1(i32 %cond, i32 %a, i32 %b) {
+; GCN-LABEL: fneg_select_i32_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, -v1, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_select_i32_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, -v1, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = xor i32 %a, u0x80000000
+ %cmp = icmp eq i32 %cond, zeroinitializer
+ %select = select i1 %cmp, i32 %neg.a, i32 %b
+ ret i32 %select
+}
+
+define i32 @fneg_select_i32_2(i32 %cond, i32 %a, i32 %b) {
+; GCN-LABEL: fneg_select_i32_2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v1, v2, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_select_i32_2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, -v1, v2, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = xor i32 %a, u0x80000000
+ %cmp = icmp eq i32 %cond, zeroinitializer
+ %select = select i1 %cmp, i32 %b, i32 %neg.a
+ ret i32 %select
+}
+
+define i32 @fneg_select_i32_both(i32 %cond, i32 %a, i32 %b) {
+; GCN-LABEL: fneg_select_i32_both:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, -v1, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_select_i32_both:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, -v2, -v1, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = xor i32 %a, u0x80000000
+ %neg.b = xor i32 %b, u0x80000000
+ %cmp = icmp eq i32 %cond, zeroinitializer
+ %select = select i1 %cmp, i32 %neg.a, i32 %neg.b
+ ret i32 %select
+}
+
+define i32 @fneg_1_fabs_2_select_i32(i32 %cond, i32 %a, i32 %b) {
+; GCN-LABEL: fneg_1_fabs_2_select_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, |v1|, -v1, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_1_fabs_2_select_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, |v1|, -v1, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = xor i32 %a, u0x80000000
+ %abs.b = and i32 %a, u0x7fffffff
+ %cmp = icmp eq i32 %cond, zeroinitializer
+ %select = select i1 %cmp, i32 %neg.a, i32 %abs.b
+ ret i32 %select
+}
+
+define i32 @s_fneg_select_i32_1(i32 inreg %cond, i32 inreg %a, i32 inreg %b) {
+; GCN-LABEL: s_fneg_select_i32_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_xor_b32 s4, s17, 0x80000000
+; GCN-NEXT: s_cmp_eq_u32 s16, 0
+; GCN-NEXT: s_cselect_b32 s4, s4, s18
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_fneg_select_i32_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s1, s1, 0x80000000
+; GFX11-NEXT: s_cmp_eq_u32 s0, 0
+; GFX11-NEXT: s_cselect_b32 s0, s1, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = xor i32 %a, u0x80000000
+ %cmp = icmp eq i32 %cond, zeroinitializer
+ %select = select i1 %cmp, i32 %neg.a, i32 %b
+ ret i32 %select
+}
+
+define i32 @s_fneg_1_fabs_2_select_i32(i32 inreg %cond, i32 %a, i32 %b) {
+; GCN-LABEL: s_fneg_1_fabs_2_select_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_cmp_eq_u32 s16, 0
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: v_cndmask_b32_e64 v0, |v0|, -v0, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_fneg_1_fabs_2_select_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_cmp_eq_u32 s0, 0
+; GFX11-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, |v0|, -v0, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = xor i32 %a, u0x80000000
+ %abs.b = and i32 %a, u0x7fffffff
+ %cmp = icmp eq i32 %cond, zeroinitializer
+ %select = select i1 %cmp, i32 %neg.a, i32 %abs.b
+ ret i32 %select
+}
+
+define <2 x i32> @fneg_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
+; GCN-LABEL: fneg_select_v2i32_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v4, -v2, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_select_v2i32_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, -v2, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000)
+ %cmp = icmp eq <2 x i32> %cond, zeroinitializer
+ %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b
+ ret <2 x i32> %select
+}
+
+define <2 x i32> @fneg_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
+; GCN-LABEL: fneg_select_v2i32_2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, v4, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v1, -v3, v5, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_select_v2i32_2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, -v2, v4, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, -v3, v5, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000)
+ %cmp = icmp eq <2 x i32> %cond, zeroinitializer
+ %select = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> %neg.a
+ ret <2 x i32> %select
+}
+
+define i32 @fabs_select_i32_1(i32 %cond, i32 %a, i32 %b) {
+; GCN-LABEL: fabs_select_i32_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, |v1|, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fabs_select_i32_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, |v1|, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = and i32 %a, u0x7fffffff
+ %cmp = icmp eq i32 %cond, zeroinitializer
+ %select = select i1 %cmp, i32 %neg.a, i32 %b
+ ret i32 %select
+}
+
+define i32 @fabs_select_i32_2(i32 %cond, i32 %a, i32 %b) {
+; GCN-LABEL: fabs_select_i32_2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, |v1|, v2, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fabs_select_i32_2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, |v1|, v2, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = and i32 %a, u0x7fffffff
+ %cmp = icmp eq i32 %cond, zeroinitializer
+ %select = select i1 %cmp, i32 %b, i32 %neg.a
+ ret i32 %select
+}
+
+define <2 x i32> @fneg_1_fabs_2_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
+; GCN-LABEL: fneg_1_fabs_2_select_v2i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v2, |v2|, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v1, -v3, |v3|, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_1_fabs_2_select_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, -v2, |v2|, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, -v3, |v3|, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000)
+ %abs.b = and <2 x i32> %a, splat (i32 u0x7fffffff)
+ %cmp = icmp eq <2 x i32> %cond, zeroinitializer
+ %select = select <2 x i1> %cmp, <2 x i32> %abs.b, <2 x i32> %neg.a
+ ret <2 x i32> %select
+}
+
+define i32 @fneg_fabs_select_i32_1(i32 %cond, i32 %a, i32 %b) {
+; GCN-LABEL: fneg_fabs_select_i32_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, -|v1|, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_fabs_select_i32_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, -|v1|, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = or i32 %a, u0x80000000
+ %cmp = icmp eq i32 %cond, zeroinitializer
+ %select = select i1 %cmp, i32 %neg.a, i32 %b
+ ret i32 %select
+}
+
+define i32 @fneg_fabs_select_i32_2(i32 %cond, i32 %a, i32 %b) {
+; GCN-LABEL: fneg_fabs_select_i32_2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -|v1|, v2, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_fabs_select_i32_2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, -|v1|, v2, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = or i32 %a, u0x80000000
+ %cmp = icmp eq i32 %cond, zeroinitializer
+ %select = select i1 %cmp, i32 %b, i32 %neg.a
+ ret i32 %select
+}
+
+define <2 x i32> @fneg_fabs_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
+; GCN-LABEL: fneg_fabs_select_v2i32_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v4, -|v2|, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v1, v5, -|v3|, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_fabs_select_v2i32_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, -|v2|, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -|v3|, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = or <2 x i32> %a, splat (i32 u0x80000000)
+ %cmp = icmp eq <2 x i32> %cond, zeroinitializer
+ %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b
+ ret <2 x i32> %select
+}
+
+define <2 x i32> @fneg_fabs_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
+; GCN-LABEL: fneg_fabs_select_v2i32_2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -|v2|, v4, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT: v_cndmask_b32_e64 v1, -|v3|, v5, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_fabs_select_v2i32_2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, -|v2|, v4, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, -|v3|, v5, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = or <2 x i32> %a, splat (i32 u0x80000000)
+ %cmp = icmp eq <2 x i32> %cond, zeroinitializer
+ %select = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> %neg.a
+ ret <2 x i32> %select
+}
+
+
+define <2 x i32> @s_fneg_select_v2i32_1(<2 x i32> inreg %cond, <2 x i32> inreg %a, <2 x i32> inreg %b) {
+; GCN-LABEL: s_fneg_select_v2i32_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_xor_b32 s4, s19, 0x80000000
+; GCN-NEXT: s_xor_b32 s5, s18, 0x80000000
+; GCN-NEXT: s_cmp_eq_u32 s16, 0
+; GCN-NEXT: s_cselect_b32 s5, s5, s20
+; GCN-NEXT: s_cmp_eq_u32 s17, 0
+; GCN-NEXT: s_cselect_b32 s4, s4, s21
+; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_fneg_select_v2i32_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000
+; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_cmp_eq_u32 s0, 0
+; GFX11-NEXT: s_cselect_b32 s0, s2, s16
+; GFX11-NEXT: s_cmp_eq_u32 s1, 0
+; GFX11-NEXT: s_cselect_b32 s1, s3, s17
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000)
+ %cmp = icmp eq <2 x i32> %cond, zeroinitializer
+ %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b
+ ret <2 x i32> %select
+}
+
+define <2 x i32> @s_fneg_fabs_select_v2i32_2(<2 x i32> inreg %cond, <2 x i32> inreg %a, <2 x i32> inreg %b) {
+; GCN-LABEL: s_fneg_fabs_select_v2i32_2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_bitset1_b32 s19, 31
+; GCN-NEXT: s_bitset1_b32 s18, 31
+; GCN-NEXT: s_cmp_eq_u32 s16, 0
+; GCN-NEXT: s_cselect_b32 s4, s20, s18
+; GCN-NEXT: s_cmp_eq_u32 s17, 0
+; GCN-NEXT: s_cselect_b32 s5, s21, s19
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_fneg_fabs_select_v2i32_2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_bitset1_b32 s3, 31
+; GFX11-NEXT: s_bitset1_b32 s2, 31
+; GFX11-NEXT: s_cmp_eq_u32 s0, 0
+; GFX11-NEXT: s_cselect_b32 s0, s16, s2
+; GFX11-NEXT: s_cmp_eq_u32 s1, 0
+; GFX11-NEXT: s_cselect_b32 s1, s17, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = or <2 x i32> %a, splat (i32 u0x80000000)
+ %cmp = icmp eq <2 x i32> %cond, zeroinitializer
+ %select = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> %neg.a
+ ret <2 x i32> %select
+}
+
+define i64 @fneg_select_i64_1(i64 %cond, i64 %a, i64 %b) {
+; GCN-LABEL: fneg_select_i64_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_select_i64_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -v3, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = xor i64 %a, u0x8000000000000000
+ %cmp = icmp eq i64 %cond, zeroinitializer
+ %select = select i1 %cmp, i64 %neg.a, i64 %b
+ ret i64 %select
+}
+
+define i64 @fneg_select_i64_2(i64 %cond, i64 %a, i64 %b) {
+; GCN-LABEL: fneg_select_i64_2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, -v3, v5, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_select_i64_2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, -v3, v5, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = xor i64 %a, u0x8000000000000000
+ %cmp = icmp eq i64 %cond, zeroinitializer
+ %select = select i1 %cmp, i64 %b, i64 %neg.a
+ ret i64 %select
+}
+
+define i64 @fneg_1_fabs_2_select_i64(i64 %cond, i64 %a, i64 %b) {
+; GCN-LABEL: fneg_1_fabs_2_select_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, |v5|, -v3, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_1_fabs_2_select_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, |v5|, -v3, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = xor i64 %a, u0x8000000000000000
+ %abs.b = and i64 %b, u0x7fffffffffffffff
+ %cmp = icmp eq i64 %cond, zeroinitializer
+ %select = select i1 %cmp, i64 %neg.a, i64 %abs.b
+ ret i64 %select
+}
+
+define i64 @fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) {
+; GCN-LABEL: fabs_select_i64_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v5, |v3|, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fabs_select_i64_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, |v3|, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = and i64 %a, u0x7fffffffffffffff
+ %cmp = icmp eq i64 %cond, zeroinitializer
+ %select = select i1 %cmp, i64 %neg.a, i64 %b
+ ret i64 %select
+}
+
+define i64 @fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) {
+; GCN-LABEL: fabs_select_i64_2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, |v3|, v5, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fabs_select_i64_2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, |v3|, v5, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = and i64 %a, u0x7fffffffffffffff
+ %cmp = icmp eq i64 %cond, zeroinitializer
+ %select = select i1 %cmp, i64 %b, i64 %neg.a
+ ret i64 %select
+}
+
+define i64 @fneg_fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) {
+; GCN-LABEL: fneg_fabs_select_i64_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v5, -|v3|, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_fabs_select_i64_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -|v3|, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = or i64 %a, u0x8000000000000000
+ %cmp = icmp eq i64 %cond, zeroinitializer
+ %select = select i1 %cmp, i64 %neg.a, i64 %b
+ ret i64 %select
+}
+
+define i64 @fneg_fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) {
+; GCN-LABEL: fneg_fabs_select_i64_2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, -|v3|, v5, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_fabs_select_i64_2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, -|v3|, v5, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = or i64 %a, u0x8000000000000000
+ %cmp = icmp eq i64 %cond, zeroinitializer
+ %select = select i1 %cmp, i64 %b, i64 %neg.a
+ ret i64 %select
+}
+
+define i64 @s_fneg_select_i64_1(i64 inreg %cond, i64 inreg %a, i64 inreg %b) {
+; GFX7-LABEL: s_fneg_select_i64_1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0
+; GFX7-NEXT: s_xor_b32 s6, s19, 0x80000000
+; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX7-NEXT: s_cselect_b32 s4, s18, s20
+; GFX7-NEXT: s_cselect_b32 s5, s6, s21
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_fneg_select_i64_1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_xor_b32 s4, s19, 0x80000000
+; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0
+; GFX9-NEXT: s_cselect_b32 s5, s18, s20
+; GFX9-NEXT: s_cselect_b32 s4, s4, s21
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_fneg_select_i64_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000
+; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0
+; GFX11-NEXT: s_cselect_b32 s0, s2, s16
+; GFX11-NEXT: s_cselect_b32 s1, s3, s17
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = xor i64 %a, u0x8000000000000000
+ %cmp = icmp eq i64 %cond, zeroinitializer
+ %select = select i1 %cmp, i64 %neg.a, i64 %b
+ ret i64 %select
+}
+
+define i64 @s_fneg_select_i64_2(i64 inreg %cond, i64 inreg %a, i64 inreg %b) {
+; GFX7-LABEL: s_fneg_select_i64_2:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0
+; GFX7-NEXT: s_xor_b32 s6, s19, 0x80000000
+; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX7-NEXT: s_cselect_b32 s4, s20, s18
+; GFX7-NEXT: s_cselect_b32 s5, s21, s6
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_fneg_select_i64_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_xor_b32 s4, s19, 0x80000000
+; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0
+; GFX9-NEXT: s_cselect_b32 s5, s20, s18
+; GFX9-NEXT: s_cselect_b32 s4, s21, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_fneg_select_i64_2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000
+; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0
+; GFX11-NEXT: s_cselect_b32 s0, s16, s2
+; GFX11-NEXT: s_cselect_b32 s1, s17, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = xor i64 %a, u0x8000000000000000
+ %cmp = icmp eq i64 %cond, zeroinitializer
+ %select = select i1 %cmp, i64 %b, i64 %neg.a
+ ret i64 %select
+}
+
+define i64 @s_fneg_1_fabs_2_select_i64(i64 inreg %cond, i64 inreg %a, i64 inreg %b) {
+; GFX7-LABEL: s_fneg_1_fabs_2_select_i64:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0
+; GFX7-NEXT: s_xor_b32 s6, s19, 0x80000000
+; GFX7-NEXT: s_bitset0_b32 s21, 31
+; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX7-NEXT: s_cselect_b32 s4, s18, s20
+; GFX7-NEXT: s_cselect_b32 s5, s6, s21
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_fneg_1_fabs_2_select_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_xor_b32 s4, s19, 0x80000000
+; GFX9-NEXT: s_bitset0_b32 s21, 31
+; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0
+; GFX9-NEXT: s_cselect_b32 s5, s18, s20
+; GFX9-NEXT: s_cselect_b32 s4, s4, s21
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_fneg_1_fabs_2_select_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000
+; GFX11-NEXT: s_bitset0_b32 s17, 31
+; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0
+; GFX11-NEXT: s_cselect_b32 s0, s2, s16
+; GFX11-NEXT: s_cselect_b32 s1, s3, s17
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = xor i64 %a, u0x8000000000000000
+ %abs.b = and i64 %b, u0x7fffffffffffffff
+ %cmp = icmp eq i64 %cond, zeroinitializer
+ %select = select i1 %cmp, i64 %neg.a, i64 %abs.b
+ ret i64 %select
+}
+
+define i64 @s_fabs_select_i64_1(i64 inreg %cond, i64 inreg %a, i64 inreg %b) {
+; GFX7-LABEL: s_fabs_select_i64_1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0
+; GFX7-NEXT: s_bitset0_b32 s19, 31
+; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX7-NEXT: s_cselect_b32 s4, s18, s20
+; GFX7-NEXT: s_cselect_b32 s5, s19, s21
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_fabs_select_i64_1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_bitset0_b32 s19, 31
+; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0
+; GFX9-NEXT: s_cselect_b32 s4, s18, s20
+; GFX9-NEXT: s_cselect_b32 s5, s19, s21
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_fabs_select_i64_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_bitset0_b32 s3, 31
+; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0
+; GFX11-NEXT: s_cselect_b32 s0, s2, s16
+; GFX11-NEXT: s_cselect_b32 s1, s3, s17
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = and i64 %a, u0x7fffffffffffffff
+ %cmp = icmp eq i64 %cond, zeroinitializer
+ %select = select i1 %cmp, i64 %neg.a, i64 %b
+ ret i64 %select
+}
+
+define i64 @s_fabs_select_i64_2(i64 inreg %cond, i64 inreg %a, i64 inreg %b) {
+; GFX7-LABEL: s_fabs_select_i64_2:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0
+; GFX7-NEXT: s_bitset0_b32 s19, 31
+; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX7-NEXT: s_cselect_b32 s4, s20, s18
+; GFX7-NEXT: s_cselect_b32 s5, s21, s19
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_fabs_select_i64_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_bitset0_b32 s19, 31
+; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0
+; GFX9-NEXT: s_cselect_b32 s4, s20, s18
+; GFX9-NEXT: s_cselect_b32 s5, s21, s19
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_fabs_select_i64_2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_bitset0_b32 s3, 31
+; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0
+; GFX11-NEXT: s_cselect_b32 s0, s16, s2
+; GFX11-NEXT: s_cselect_b32 s1, s17, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = and i64 %a, u0x7fffffffffffffff
+ %cmp = icmp eq i64 %cond, zeroinitializer
+ %select = select i1 %cmp, i64 %b, i64 %neg.a
+ ret i64 %select
+}
+
+define i64 @s_fneg_fabs_select_i64_1(i64 inreg %cond, i64 inreg %a, i64 inreg %b) {
+; GFX7-LABEL: s_fneg_fabs_select_i64_1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0
+; GFX7-NEXT: s_bitset1_b32 s19, 31
+; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX7-NEXT: s_cselect_b32 s4, s18, s20
+; GFX7-NEXT: s_cselect_b32 s5, s19, s21
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_fneg_fabs_select_i64_1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_bitset1_b32 s19, 31
+; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0
+; GFX9-NEXT: s_cselect_b32 s4, s18, s20
+; GFX9-NEXT: s_cselect_b32 s5, s19, s21
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_fneg_fabs_select_i64_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_bitset1_b32 s3, 31
+; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0
+; GFX11-NEXT: s_cselect_b32 s0, s2, s16
+; GFX11-NEXT: s_cselect_b32 s1, s3, s17
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = or i64 %a, u0x8000000000000000
+ %cmp = icmp eq i64 %cond, zeroinitializer
+ %select = select i1 %cmp, i64 %neg.a, i64 %b
+ ret i64 %select
+}
+
+define i64 @s_fneg_fabs_select_i64_2(i64 inreg %cond, i64 inreg %a, i64 inreg %b) {
+; GFX7-LABEL: s_fneg_fabs_select_i64_2:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[16:17], 0
+; GFX7-NEXT: s_bitset1_b32 s19, 31
+; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GFX7-NEXT: s_cselect_b32 s4, s20, s18
+; GFX7-NEXT: s_cselect_b32 s5, s21, s19
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_fneg_fabs_select_i64_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_bitset1_b32 s19, 31
+; GFX9-NEXT: s_cmp_eq_u64 s[16:17], 0
+; GFX9-NEXT: s_cselect_b32 s4, s20, s18
+; GFX9-NEXT: s_cselect_b32 s5, s21, s19
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_fneg_fabs_select_i64_2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_bitset1_b32 s3, 31
+; GFX11-NEXT: s_cmp_eq_u64 s[0:1], 0
+; GFX11-NEXT: s_cselect_b32 s0, s16, s2
+; GFX11-NEXT: s_cselect_b32 s1, s17, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = or i64 %a, u0x8000000000000000
+ %cmp = icmp eq i64 %cond, zeroinitializer
+ %select = select i1 %cmp, i64 %b, i64 %neg.a
+ ret i64 %select
+}
+
+define i16 @fneg_select_i16_1(i16 %cond, i16 %a, i16 %b) {
+; GFX7-LABEL: fneg_select_i16_1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_select_i16_1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: fneg_select_i16_1:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fneg_select_i16_1:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = xor i16 %a, u0x8000
+ %cmp = icmp eq i16 %cond, zeroinitializer
+ %select = select i1 %cmp, i16 %neg.a, i16 %b
+ ret i16 %select
+}
+
+define i16 @fneg_select_i16_2(i16 %cond, i16 %a, i16 %b) {
+; GFX7-LABEL: fneg_select_i16_2:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_select_i16_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: fneg_select_i16_2:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fneg_select_i16_2:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = xor i16 %a, u0x8000
+ %cmp = icmp eq i16 %cond, zeroinitializer
+ %select = select i1 %cmp, i16 %b, i16 %neg.a
+ ret i16 %select
+}
+
+define i16 @fneg_select_i16_both(i16 %cond, i16 %a, i16 %b) {
+; GFX7-LABEL: fneg_select_i16_both:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_select_i16_both:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: fneg_select_i16_both:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fneg_select_i16_both:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = xor i16 %a, u0x8000
+ %neg.b = xor i16 %b, u0x8000
+ %cmp = icmp eq i16 %cond, zeroinitializer
+ %select = select i1 %cmp, i16 %neg.a, i16 %neg.b
+ ret i16 %select
+}
+
+define i16 @fneg_1_fabs_2_select_i16(i16 %cond, i16 %a, i16 %b) {
+; GFX7-LABEL: fneg_1_fabs_2_select_i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_xor_b32_e32 v2, 0xffff8000, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_1_fabs_2_select_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v2, 0xffff8000, v1
+; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: fneg_1_fabs_2_select_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_xor_b16 v0.h, 0x8000, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0x7fff, v1.l
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fneg_1_fabs_2_select_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_xor_b32_e32 v2, 0xffff8000, v1
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = xor i16 %a, u0x8000
+ %abs.b = and i16 %a, u0x7fff
+ %cmp = icmp eq i16 %cond, zeroinitializer
+ %select = select i1 %cmp, i16 %neg.a, i16 %abs.b
+ ret i16 %select
+}
diff --git a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
index 5dff7372..294c904 100644
--- a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
; Check illegal casts are codegened as poison, and not an error.
diff --git a/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll
index 8fc5afb..17c8010 100644
--- a/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll
@@ -101,3 +101,29 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
%ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false)
ret i64 %ret
}
+
+declare amdgpu_gfx_whole_wave i32 @callee(i1 %active, i32 %x)
+
+; Make sure we don't pass the first argument (i1).
+define amdgpu_cs void @call(i32 %x, ptr %p) {
+ ; CHECK-LABEL: name: call
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+ ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
+ ; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32)
+ ; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV1]](p0), @callee, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
+ ; CHECK-NEXT: G_STORE [[COPY3]](s32), [[MV]](p0) :: (store (s32) into %ir.p)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %ret = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @callee, i32 %x) convergent
+ store i32 %ret, ptr %p
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
index b15ddc9..ece8662 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll
@@ -693,8 +693,6 @@ define amdgpu_cs_chain void @nonuniform_callee(ptr %callee, i32 inreg %sgpr, i32
; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec
@@ -715,8 +713,6 @@ define amdgpu_cs_chain void @nonuniform_callee(ptr %callee, i32 inreg %sgpr, i32
; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
index 57b865d..6c9c7a4 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll
@@ -693,8 +693,6 @@ define amdgpu_cs_chain void @nonuniform_callee(ptr %callee, i32 inreg %sgpr, i32
; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
; DAGISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; DAGISEL-GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec
@@ -715,8 +713,6 @@ define amdgpu_cs_chain void @nonuniform_callee(ptr %callee, i32 inreg %sgpr, i32
; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr9
; DAGISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr8
- ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; DAGISEL-GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
index a873c01..ea3cd8a 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
@@ -377,8 +377,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_struct( {ptr, i32, <4 x i32>} in
; DAGISEL-GFX11-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY12]], %subreg.sub1
; DAGISEL-GFX11-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
- ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; DAGISEL-GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; DAGISEL-GFX11-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
@@ -424,8 +422,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_struct( {ptr, i32, <4 x i32>} in
; DAGISEL-GFX10-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY12]], %subreg.sub1
; DAGISEL-GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
- ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; DAGISEL-GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; DAGISEL-GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
index ab99defc..fdcb033 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
@@ -84,13 +84,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc(<4 x i32> inre
; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec
; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec
; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX11-WF32-NEXT: [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`)
; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0
@@ -111,13 +107,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc(<4 x i32> inre
; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec
; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec
; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX11-WF64-NEXT: [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`)
; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0
@@ -138,13 +130,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc(<4 x i32> inre
; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec
; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec
; DAGISEL-GFX10-WF32-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX10-WF32-NEXT: [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`)
; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0
@@ -165,13 +153,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc(<4 x i32> inre
; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY1]], 0, implicit $exec
; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY6]], [[COPY2]], 0, implicit $exec
; DAGISEL-GFX10-WF64-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY7]], [[COPY3]], 0, implicit $exec
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_ADD_U32_e64_3]], %subreg.sub0, [[V_ADD_U32_e64_2]], %subreg.sub1, [[V_ADD_U32_e64_1]], %subreg.sub2, [[V_ADD_U32_e64_]], %subreg.sub3
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX10-WF64-NEXT: [[COPY9:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_DWORDX4 killed [[COPY8]], killed [[COPY9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`)
; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0
@@ -260,14 +244,10 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_ptr(ptr inreg
; DAGISEL-GFX11-WF32-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
; DAGISEL-GFX11-WF32-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; DAGISEL-GFX11-WF32-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
; DAGISEL-GFX11-WF32-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
; DAGISEL-GFX11-WF32-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b)
@@ -298,14 +278,10 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_ptr(ptr inreg
; DAGISEL-GFX11-WF64-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
; DAGISEL-GFX11-WF64-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; DAGISEL-GFX11-WF64-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
; DAGISEL-GFX11-WF64-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
; DAGISEL-GFX11-WF64-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b)
@@ -336,14 +312,10 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_ptr(ptr inreg
; DAGISEL-GFX10-WF32-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
; DAGISEL-GFX10-WF32-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; DAGISEL-GFX10-WF32-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
; DAGISEL-GFX10-WF32-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
; DAGISEL-GFX10-WF32-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b)
@@ -374,14 +346,10 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_ptr(ptr inreg
; DAGISEL-GFX10-WF64-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
; DAGISEL-GFX10-WF64-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; DAGISEL-GFX10-WF64-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
; DAGISEL-GFX10-WF64-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
; DAGISEL-GFX10-WF64-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_DWORDX2 killed [[COPY16]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.b)
@@ -498,41 +466,35 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_struct( {ptr,
; DAGISEL-GFX11-WF32-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2
; DAGISEL-GFX11-WF32-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; DAGISEL-GFX11-WF32-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
; DAGISEL-GFX11-WF32-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
; DAGISEL-GFX11-WF32-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
; DAGISEL-GFX11-WF32-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
; DAGISEL-GFX11-WF32-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; DAGISEL-GFX11-WF32-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
; DAGISEL-GFX11-WF32-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF1]]
; DAGISEL-GFX11-WF32-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF2]]
; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
; DAGISEL-GFX11-WF32-NEXT: [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1)
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1)
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF5:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF5]]
; DAGISEL-GFX11-WF32-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]]
; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1)
; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0
@@ -555,41 +517,35 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_struct( {ptr,
; DAGISEL-GFX11-WF64-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2
; DAGISEL-GFX11-WF64-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; DAGISEL-GFX11-WF64-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
; DAGISEL-GFX11-WF64-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
; DAGISEL-GFX11-WF64-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
; DAGISEL-GFX11-WF64-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
; DAGISEL-GFX11-WF64-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; DAGISEL-GFX11-WF64-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
; DAGISEL-GFX11-WF64-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF1]]
; DAGISEL-GFX11-WF64-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF2]]
; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
; DAGISEL-GFX11-WF64-NEXT: [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1)
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1)
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF5:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF5]]
; DAGISEL-GFX11-WF64-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]]
; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1)
; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0
@@ -612,41 +568,35 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_struct( {ptr,
; DAGISEL-GFX10-WF32-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2
; DAGISEL-GFX10-WF32-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; DAGISEL-GFX10-WF32-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
; DAGISEL-GFX10-WF32-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
; DAGISEL-GFX10-WF32-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
; DAGISEL-GFX10-WF32-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
; DAGISEL-GFX10-WF32-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; DAGISEL-GFX10-WF32-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
; DAGISEL-GFX10-WF32-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF1]]
; DAGISEL-GFX10-WF32-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF2]]
; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
; DAGISEL-GFX10-WF32-NEXT: [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1)
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1)
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF5:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF5]]
; DAGISEL-GFX10-WF32-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]]
; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1)
; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0
@@ -669,41 +619,35 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_struct( {ptr,
; DAGISEL-GFX10-WF64-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr2
; DAGISEL-GFX10-WF64-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; DAGISEL-GFX10-WF64-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
; DAGISEL-GFX10-WF64-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
; DAGISEL-GFX10-WF64-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
; DAGISEL-GFX10-WF64-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
; DAGISEL-GFX10-WF64-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY14]], %subreg.sub0, [[COPY15]], %subreg.sub1, [[COPY16]], %subreg.sub2, [[COPY17]], %subreg.sub3
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; DAGISEL-GFX10-WF64-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY13]]
; DAGISEL-GFX10-WF64-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY18]], %subreg.sub0, [[COPY19]], %subreg.sub1
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF6:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF6]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY20:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY20]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF7:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF7]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY21:%[0-9]+]]:vreg_64 = COPY [[DEF1]]
; DAGISEL-GFX10-WF64-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY11]]
; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY21]], [[COPY22]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF8:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF8]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY23:%[0-9]+]]:vreg_64 = COPY [[DEF2]]
; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX4 [[COPY23]], killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY24:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
; DAGISEL-GFX10-WF64-NEXT: [[COPY25:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX4 [[COPY24]], killed [[COPY25]], 0, 0, implicit $exec :: (store (s128) into `ptr addrspace(1) poison` + 16, addrspace 1)
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF10:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF10]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY26:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY26]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison` + 8, align 8, basealign 16, addrspace 1)
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF11:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF11]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF5:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY27:%[0-9]+]]:vreg_64 = COPY [[DEF5]]
; DAGISEL-GFX10-WF64-NEXT: [[COPY28:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]]
; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY27]], killed [[COPY28]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, align 16, addrspace 1)
; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0
@@ -1289,22 +1233,14 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_v16i16(<16 x i
; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec
; DAGISEL-GFX11-WF32-NEXT: [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX11-WF32-NEXT: [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16)
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF1]]
; DAGISEL-GFX11-WF32-NEXT: [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32)
; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0
@@ -1337,22 +1273,14 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_v16i16(<16 x i
; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec
; DAGISEL-GFX11-WF64-NEXT: [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX11-WF64-NEXT: [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16)
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF1]]
; DAGISEL-GFX11-WF64-NEXT: [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32)
; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0
@@ -1385,22 +1313,14 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_v16i16(<16 x i
; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec
; DAGISEL-GFX10-WF32-NEXT: [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX10-WF32-NEXT: [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16)
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF1]]
; DAGISEL-GFX10-WF32-NEXT: [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32)
; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0
@@ -1433,22 +1353,14 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_v16i16(<16 x i
; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_5:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY9]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec
; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_6:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY10]], 8, [[COPY2]], 0, 0, 0, 0, 0, implicit $exec
; DAGISEL-GFX10-WF64-NEXT: [[V_PK_ADD_U16_7:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[COPY11]], 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $exec
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_7]], %subreg.sub0, [[V_PK_ADD_U16_6]], %subreg.sub1, [[V_PK_ADD_U16_5]], %subreg.sub2, [[V_PK_ADD_U16_4]], %subreg.sub3
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF4]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX10-WF64-NEXT: [[COPY17:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_DWORDX4 [[COPY16]], killed [[COPY17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison` + 16)
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_PK_ADD_U16_3]], %subreg.sub0, [[V_PK_ADD_U16_2]], %subreg.sub1, [[V_PK_ADD_U16_1]], %subreg.sub2, [[V_PK_ADD_U16_]], %subreg.sub3
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF9:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF9]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY18:%[0-9]+]]:vreg_64 = COPY [[DEF1]]
; DAGISEL-GFX10-WF64-NEXT: [[COPY19:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_DWORDX4 [[COPY18]], killed [[COPY19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr poison`, align 32)
; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0
@@ -1503,11 +1415,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_many_regs(<36 x i
; DAGISEL-GFX11-WF32-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX11-WF32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX11-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; DAGISEL-GFX11-WF32-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF32-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
+ ; DAGISEL-GFX11-WF32-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF32-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF1]]
; DAGISEL-GFX11-WF32-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; DAGISEL-GFX11-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0
@@ -1523,11 +1433,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_many_regs(<36 x i
; DAGISEL-GFX11-WF64-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX11-WF64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX11-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; DAGISEL-GFX11-WF64-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX11-WF64-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
+ ; DAGISEL-GFX11-WF64-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX11-WF64-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF1]]
; DAGISEL-GFX11-WF64-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; DAGISEL-GFX11-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0
@@ -1543,11 +1451,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_many_regs(<36 x i
; DAGISEL-GFX10-WF32-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX10-WF32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX10-WF32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; DAGISEL-GFX10-WF32-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF32-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
+ ; DAGISEL-GFX10-WF32-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF32-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF1]]
; DAGISEL-GFX10-WF32-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; DAGISEL-GFX10-WF32-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
; DAGISEL-GFX10-WF32-NEXT: S_ENDPGM 0
@@ -1563,11 +1469,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_many_regs(<36 x i
; DAGISEL-GFX10-WF64-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX10-WF64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORD [[COPY3]], [[COPY4]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1)
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-GFX10-WF64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; DAGISEL-GFX10-WF64-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; DAGISEL-GFX10-WF64-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF3]]
+ ; DAGISEL-GFX10-WF64-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; DAGISEL-GFX10-WF64-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF1]]
; DAGISEL-GFX10-WF64-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
; DAGISEL-GFX10-WF64-NEXT: GLOBAL_STORE_DWORDX2 [[COPY5]], killed [[COPY6]], 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) poison`, addrspace 1)
; DAGISEL-GFX10-WF64-NEXT: S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll
index 3450d63..44c162c 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll
@@ -135,11 +135,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; DAGISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; DAGISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; DAGISEL-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; DAGISEL-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; DAGISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec
; DAGISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[SI_WHOLE_WAVE_FUNC_SETUP]]
@@ -158,7 +154,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
; DAGISEL-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], killed [[V_CNDMASK_B32_e64_2]], 1, 1, 1, 0, implicit $exec
; DAGISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]]
; DAGISEL-NEXT: $vgpr1 = COPY [[V_MOV_B32_dpp1]]
- ; DAGISEL-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0, implicit $vgpr1
;
; GISEL-LABEL: name: ret_64
@@ -189,3 +185,77 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
ret i64 %ret
}
+declare amdgpu_gfx_whole_wave i32 @callee(i1 %active, <8 x i32> %x)
+
+; Make sure we don't pass the first (i1 %active) argument to the callee.
+define amdgpu_cs void @call(<8 x i32> %x, ptr %p) {
+ ; DAGISEL-LABEL: name: call
+ ; DAGISEL: bb.0 (%ir-block.0):
+ ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
+ ; DAGISEL-NEXT: {{ $}}
+ ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+ ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; DAGISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr7
+ ; DAGISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+ ; DAGISEL-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; DAGISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; DAGISEL-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; DAGISEL-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; DAGISEL-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; DAGISEL-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; DAGISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; DAGISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+ ; DAGISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+ ; DAGISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+ ; DAGISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; DAGISEL-NEXT: $vgpr0 = COPY [[COPY9]]
+ ; DAGISEL-NEXT: $vgpr1 = COPY [[COPY8]]
+ ; DAGISEL-NEXT: $vgpr2 = COPY [[COPY7]]
+ ; DAGISEL-NEXT: $vgpr3 = COPY [[COPY6]]
+ ; DAGISEL-NEXT: $vgpr4 = COPY [[COPY5]]
+ ; DAGISEL-NEXT: $vgpr5 = COPY [[COPY4]]
+ ; DAGISEL-NEXT: $vgpr6 = COPY [[COPY3]]
+ ; DAGISEL-NEXT: $vgpr7 = COPY [[COPY2]]
+ ; DAGISEL-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[REG_SEQUENCE1]], @callee, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit-def $vgpr0
+ ; DAGISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; DAGISEL-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; DAGISEL-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+ ; DAGISEL-NEXT: FLAT_STORE_DWORD killed [[COPY11]], [[COPY10]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.p)
+ ; DAGISEL-NEXT: S_ENDPGM 0
+ ;
+ ; GISEL-LABEL: name: call
+ ; GISEL: bb.1 (%ir-block.0):
+ ; GISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GISEL-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GISEL-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+ ; GISEL-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr7
+ ; GISEL-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; GISEL-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr9
+ ; GISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
+ ; GISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; GISEL-NEXT: $vgpr0 = COPY [[COPY]]
+ ; GISEL-NEXT: $vgpr1 = COPY [[COPY1]]
+ ; GISEL-NEXT: $vgpr2 = COPY [[COPY2]]
+ ; GISEL-NEXT: $vgpr3 = COPY [[COPY3]]
+ ; GISEL-NEXT: $vgpr4 = COPY [[COPY4]]
+ ; GISEL-NEXT: $vgpr5 = COPY [[COPY5]]
+ ; GISEL-NEXT: $vgpr6 = COPY [[COPY6]]
+ ; GISEL-NEXT: $vgpr7 = COPY [[COPY7]]
+ ; GISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee
+ ; GISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee
+ ; GISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+ ; GISEL-NEXT: $sgpr30_sgpr31 = SI_CALL [[REG_SEQUENCE1]], @callee, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit-def $vgpr0
+ ; GISEL-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; GISEL-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE]], [[COPY10]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.p)
+ ; GISEL-NEXT: S_ENDPGM 0
+ %ret = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @callee, <8 x i32> %x) convergent
+ store i32 %ret, ptr %p
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/issue120256-annotate-constexpr-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/issue120256-annotate-constexpr-addrspacecast.ll
index 7af6d3a..8e87256 100644
--- a/llvm/test/CodeGen/AMDGPU/issue120256-annotate-constexpr-addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue120256-annotate-constexpr-addrspacecast.ll
@@ -55,8 +55,8 @@ define amdgpu_kernel void @issue120256_private(ptr addrspace(1) %out) {
; FIXME: Inference of amdgpu-no-queue-ptr should not depend on code object version.
!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx803" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx803" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx803" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx803" "uniform-work-group-size"="false" }
;.
; CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 400}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
index 1c298014..3001248 100644
--- a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
@@ -6,16 +6,24 @@ define amdgpu_gfx [13 x i32] @issue130120() {
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: s_add_i32 s0, s32, 0xf0
-; CHECK-NEXT: s_add_i32 s1, s32, 0xf4
-; CHECK-NEXT: s_add_i32 s2, s32, 0xf8
-; CHECK-NEXT: s_add_i32 s3, s32, 0xfc
+; CHECK-NEXT: s_movk_i32 s1, 0xf4
+; CHECK-NEXT: s_movk_i32 s2, 0xf8
+; CHECK-NEXT: s_movk_i32 s3, 0xfc
+; CHECK-NEXT: s_movk_i32 s34, 0x100
; CHECK-NEXT: v_mov_b32_e32 v1, v0
-; CHECK-NEXT: s_add_i32 s34, s32, 0x100
-; CHECK-NEXT: s_add_i32 s35, s32, 0x104
-; CHECK-NEXT: s_add_i32 s36, s32, 0x108
-; CHECK-NEXT: s_add_i32 s37, s32, 0x110
-; CHECK-NEXT: s_add_i32 s38, s32, 0x120
+; CHECK-NEXT: s_movk_i32 s35, 0x104
+; CHECK-NEXT: s_movk_i32 s36, 0x108
+; CHECK-NEXT: s_movk_i32 s37, 0x110
+; CHECK-NEXT: s_movk_i32 s38, 0x120
+; CHECK-NEXT: s_add_i32 s0, s32, 0xf0
+; CHECK-NEXT: s_add_i32 s1, s32, s1
+; CHECK-NEXT: s_add_i32 s2, s32, s2
+; CHECK-NEXT: s_add_i32 s3, s32, s3
+; CHECK-NEXT: s_add_i32 s34, s32, s34
+; CHECK-NEXT: s_add_i32 s35, s32, s35
+; CHECK-NEXT: s_add_i32 s36, s32, s36
+; CHECK-NEXT: s_add_i32 s37, s32, s37
+; CHECK-NEXT: s_add_i32 s38, s32, s38
; CHECK-NEXT: s_or_b32 s39, s32, 4
; CHECK-NEXT: s_or_b32 s40, s32, 8
; CHECK-NEXT: s_or_b32 s41, s32, 12
diff --git a/llvm/test/CodeGen/AMDGPU/issue153808-extract-subvector-legalize.ll b/llvm/test/CodeGen/AMDGPU/issue153808-extract-subvector-legalize.ll
new file mode 100644
index 0000000..75c5d20
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/issue153808-extract-subvector-legalize.ll
@@ -0,0 +1,192 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX942 %s
+
+define <3 x float> @extract_subvector_v3f32_v33f32_elt30_0(ptr addrspace(1) %ptr) #0 {
+; GFX900-LABEL: extract_subvector_v3f32_v33f32_elt30_0:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:96 glc
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:80 glc
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:64 glc
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:48 glc
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:32 glc
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: global_load_dword v2, v[0:1], off offset:128 glc
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[3:6], v[0:1], off offset:112 glc
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, v5
+; GFX900-NEXT: v_mov_b32_e32 v1, v6
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: extract_subvector_v3f32_v33f32_elt30_0:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:96 sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:80 sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:64 sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:48 sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:32 sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_load_dword v2, v[0:1], off offset:128 sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:112 sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
+; GFX942-NEXT: v_mov_b32_e32 v1, v7
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+ %val = load volatile <33 x float>, ptr addrspace(1) %ptr, align 4
+ %extract.subvector = shufflevector <33 x float> %val, <33 x float> poison, <3 x i32> <i32 30, i32 31, i32 32>
+ ret <3 x float> %extract.subvector
+}
+
+define <3 x float> @extract_subvector_v3f32_v33f32_elt30_1(ptr addrspace(1) %ptr) #0 {
+; GFX900-LABEL: extract_subvector_v3f32_v33f32_elt30_1:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
+; GFX900-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:112
+; GFX900-NEXT: global_load_dword v2, v[0:1], off offset:128
+; GFX900-NEXT: s_mov_b32 s4, 0
+; GFX900-NEXT: s_mov_b32 s5, s4
+; GFX900-NEXT: s_mov_b32 s6, s4
+; GFX900-NEXT: s_mov_b32 s7, s4
+; GFX900-NEXT: s_waitcnt vmcnt(2)
+; GFX900-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0
+; GFX900-NEXT: s_waitcnt vmcnt(2)
+; GFX900-NEXT: v_mov_b32_e32 v0, v9
+; GFX900-NEXT: v_mov_b32_e32 v1, v10
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: extract_subvector_v3f32_v33f32_elt30_1:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
+; GFX942-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:112
+; GFX942-NEXT: global_load_dword v2, v[0:1], off offset:128
+; GFX942-NEXT: s_mov_b32 s0, 0
+; GFX942-NEXT: s_mov_b32 s1, s0
+; GFX942-NEXT: s_mov_b32 s2, s0
+; GFX942-NEXT: s_mov_b32 s3, s0
+; GFX942-NEXT: s_waitcnt vmcnt(2)
+; GFX942-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; GFX942-NEXT: s_waitcnt vmcnt(2)
+; GFX942-NEXT: v_mov_b32_e32 v0, v10
+; GFX942-NEXT: v_mov_b32_e32 v1, v11
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+ %val = load <33 x float>, ptr addrspace(1) %ptr, align 4
+ %val.slice.0 = shufflevector <33 x float> %val, <33 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %val.slice.0, ptr addrspace(8) null, i32 0, i32 0, i32 0)
+ %val.slice.48 = shufflevector <33 x float> %val, <33 x float> poison, <3 x i32> <i32 30, i32 31, i32 32>
+ ret <3 x float> %val.slice.48
+}
+
+define <6 x float> @extract_subvector_v6f32_v36f32_elt30(ptr addrspace(1) %ptr) #0 {
+; GFX900-LABEL: extract_subvector_v6f32_v36f32_elt30:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
+; GFX900-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:112
+; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:128
+; GFX900-NEXT: s_mov_b32 s4, 0
+; GFX900-NEXT: s_mov_b32 s5, s4
+; GFX900-NEXT: s_mov_b32 s6, s4
+; GFX900-NEXT: s_mov_b32 s7, s4
+; GFX900-NEXT: s_waitcnt vmcnt(2)
+; GFX900-NEXT: buffer_store_dwordx4 v[6:9], off, s[4:7], 0
+; GFX900-NEXT: s_waitcnt vmcnt(2)
+; GFX900-NEXT: v_mov_b32_e32 v0, v12
+; GFX900-NEXT: v_mov_b32_e32 v1, v13
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: extract_subvector_v6f32_v36f32_elt30:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
+; GFX942-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:112
+; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:128
+; GFX942-NEXT: s_mov_b32 s0, 0
+; GFX942-NEXT: s_mov_b32 s1, s0
+; GFX942-NEXT: s_mov_b32 s2, s0
+; GFX942-NEXT: s_mov_b32 s3, s0
+; GFX942-NEXT: s_waitcnt vmcnt(2)
+; GFX942-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0
+; GFX942-NEXT: s_waitcnt vmcnt(2)
+; GFX942-NEXT: v_mov_b32_e32 v0, v12
+; GFX942-NEXT: v_mov_b32_e32 v1, v13
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+ %val = load <36 x float>, ptr addrspace(1) %ptr, align 4
+ %val.slice.0 = shufflevector <36 x float> %val, <36 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %val.slice.0, ptr addrspace(8) null, i32 0, i32 0, i32 0)
+ %val.slice.1 = shufflevector <36 x float> %val, <36 x float> poison, <6 x i32> <i32 30, i32 31, i32 32, i32 33, i32 34, i32 35>
+ ret <6 x float> %val.slice.1
+}
+
+define <3 x float> @issue153808_vector_extract_assert(ptr addrspace(1) %ptr) #0 {
+; GFX900-LABEL: issue153808_vector_extract_assert:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v4, v1
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: global_load_dwordx4 v[5:8], v[3:4], off
+; GFX900-NEXT: global_load_dwordx3 v[0:2], v[3:4], off offset:192
+; GFX900-NEXT: s_mov_b32 s4, 0
+; GFX900-NEXT: s_mov_b32 s5, s4
+; GFX900-NEXT: s_mov_b32 s6, s4
+; GFX900-NEXT: s_mov_b32 s7, s4
+; GFX900-NEXT: s_waitcnt vmcnt(1)
+; GFX900-NEXT: buffer_store_dwordx4 v[5:8], off, s[4:7], 0
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: issue153808_vector_extract_assert:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
+; GFX942-NEXT: global_load_dwordx3 v[2:4], v[0:1], off offset:192
+; GFX942-NEXT: s_mov_b32 s0, 0
+; GFX942-NEXT: s_mov_b32 s1, s0
+; GFX942-NEXT: s_mov_b32 s2, s0
+; GFX942-NEXT: s_mov_b32 s3, s0
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0
+; GFX942-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
+; GFX942-NEXT: v_mov_b32_e32 v1, v3
+; GFX942-NEXT: v_mov_b32_e32 v2, v4
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+ %val = load <51 x float>, ptr addrspace(1) %ptr, align 4
+ %val.slice.0 = shufflevector <51 x float> %val, <51 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %val.slice.0, ptr addrspace(8) null, i32 0, i32 0, i32 0)
+ %val.slice.48 = shufflevector <51 x float> %val, <51 x float> poison, <3 x i32> <i32 48, i32 49, i32 50>
+ ret <3 x float> %val.slice.48
+}
+
+declare void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8) writeonly captures(none), i32, i32, i32 immarg) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
index c316ec71..9684712 100644
--- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
@@ -797,12 +797,11 @@ define double @sitofp_i128_to_f64(i128 %x) {
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GISEL-NEXT: .LBB2_13: ; %Flow4
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
-; GISEL-NEXT: v_and_b32_e32 v1, 0x80000000, v6
; GISEL-NEXT: v_mov_b32_e32 v2, 0x3ff00000
-; GISEL-NEXT: v_mov_b32_e32 v3, 0xfffff
+; GISEL-NEXT: v_and_b32_e32 v1, 0x80000000, v6
; GISEL-NEXT: v_lshl_add_u32 v2, v8, 20, v2
-; GISEL-NEXT: v_and_or_b32 v1, v10, v3, v1
-; GISEL-NEXT: v_or3_b32 v1, v1, v2, 0
+; GISEL-NEXT: v_and_b32_e32 v3, 0xfffff, v10
+; GISEL-NEXT: v_or3_b32 v1, v3, v1, v2
; GISEL-NEXT: .LBB2_14: ; %Flow5
; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1081,8 +1080,8 @@ define double @uitofp_i128_to_f64(i128 %x) {
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff00000
; GISEL-NEXT: v_lshl_add_u32 v0, v7, 20, v0
-; GISEL-NEXT: v_and_b32_e32 v1, 0xfffff, v9
-; GISEL-NEXT: v_or3_b32 v5, v1, v0, 0
+; GISEL-NEXT: v_mov_b32_e32 v1, 0xfffff
+; GISEL-NEXT: v_and_or_b32 v5, v9, v1, v0
; GISEL-NEXT: .LBB3_14: ; %Flow5
; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
; GISEL-NEXT: v_mov_b32_e32 v0, v4
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
index d23509b..b91963f 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
@@ -150,6 +150,7 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
; GFX10CU-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
; GFX10CU-NEXT: v_mov_b32_e32 v0, s13
; GFX10CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10CU-NEXT: s_waitcnt_depctr 0xffe3
; GFX10CU-NEXT: s_barrier
; GFX10CU-NEXT: ds_read_b32 v0, v0
; GFX10CU-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll
new file mode 100644
index 0000000..da92dcd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll
@@ -0,0 +1,13 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; GFX1250 supports up to 320 KB of LDS memory.
+; This is a negative test for when the LDS size exceeds the maximum usable limit.
+
+; ERROR: error: <unknown>:0:0: local memory (327684) exceeds limit (327680) in function 'test_lds_limit'
+@dst = addrspace(3) global [81921 x i32] undef
+
+define amdgpu_kernel void @test_lds_limit(i32 %val) {
+ %gep = getelementptr [81921 x i32], ptr addrspace(3) @dst, i32 0, i32 100
+ store i32 %val, ptr addrspace(3) %gep
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll
new file mode 100644
index 0000000..3db0fa8f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll
@@ -0,0 +1,72 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=MESA %s
+
+; GFX1250 supports up to 320 KB of configurable LDS memory.
+; This test checks the minimum and maximum LDS sizes that can be allocated.
+
+@lds.i8 = addrspace(3) global i8 undef
+@lds.array.i8 = addrspace(3) global [327679 x i8] undef
+@lds.i16 = addrspace(3) global i16 undef
+@lds.array.i16 = addrspace(3) global [163839 x i16] undef
+@lds.i32 = addrspace(3) global i32 undef
+@lds.array.i32 = addrspace(3) global [81919 x i32] undef
+
+; GCN-LABEL: test_lds_i8:
+; GCN: .amdhsa_group_segment_fixed_size 1
+; GCN: ; LDSByteSize: 1 bytes/workgroup
+; MESA: granulated_lds_size = 1
+define amdgpu_kernel void @test_lds_i8(i8 %val) {
+ store i8 %val, ptr addrspace(3) @lds.i8
+ ret void
+}
+
+; GCN-LABEL: test_lds_i16:
+; GCN: .amdhsa_group_segment_fixed_size 2
+; GCN: ; LDSByteSize: 2 bytes/workgroup
+; MESA: granulated_lds_size = 1
+define amdgpu_kernel void @test_lds_i16(i16 %val) {
+ store i16 %val, ptr addrspace(3) @lds.i16
+ ret void
+}
+
+; GCN-LABEL: test_lds_i32:
+; GCN: .amdhsa_group_segment_fixed_size 4
+; GCN: ; LDSByteSize: 4 bytes/workgroup
+; MESA: granulated_lds_size = 1
+define amdgpu_kernel void @test_lds_i32(i32 %val) {
+ store i32 %val, ptr addrspace(3) @lds.i32
+ ret void
+}
+
+; GCN-LABEL: test_lds_array_i8:
+; GCN: .amdhsa_group_segment_fixed_size 327680
+; GCN: ; LDSByteSize: 327680 bytes/workgroup
+; MESA: granulated_lds_size = 320
+define amdgpu_kernel void @test_lds_array_i8() {
+ %gep = getelementptr inbounds [327679 x i8], ptr addrspace(3) @lds.array.i8, i32 0, i32 5
+ %val = load i8, ptr addrspace(3) %gep
+ store i8 %val, ptr addrspace(3) @lds.i8
+ ret void
+}
+
+; GCN-LABEL: test_lds_array_i16:
+; GCN: .amdhsa_group_segment_fixed_size 327680
+; GCN: ; LDSByteSize: 327680 bytes/workgroup
+; MESA: granulated_lds_size = 320
+define amdgpu_kernel void @test_lds_array_i16() {
+ %gep = getelementptr inbounds [163839 x i16], ptr addrspace(3) @lds.array.i16, i32 0, i32 10
+ %val = load i16, ptr addrspace(3) %gep
+ store i16 %val, ptr addrspace(3) @lds.i16
+ ret void
+}
+
+; GCN-LABEL: test_lds_array_i32:
+; GCN: .amdhsa_group_segment_fixed_size 327680
+; GCN: ; LDSByteSize: 327680 bytes/workgroup
+; MESA: granulated_lds_size = 320
+define amdgpu_kernel void @test_lds_array_i32() {
+ %gep = getelementptr inbounds [81919 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
+ %val = load i32, ptr addrspace(3) %gep
+ store i32 %val, ptr addrspace(3) @lds.i32
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll
new file mode 100644
index 0000000..bfa7d37
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll
@@ -0,0 +1,61 @@
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=PAL %s
+
+; GFX1250 supports up to 320 KB of configurable LDS memory.
+; This test checks the minimum and maximum LDS sizes that can be allocated.
+
+; PAL: .shader_functions:
+; PAL: test_lds_array_i16:
+; PAL: .lds_size: 0x50000
+; PAL: test_lds_array_i32:
+; PAL: .lds_size: 0x50000
+; PAL: test_lds_array_i8:
+; PAL: .lds_size: 0x50000
+; PAL: test_lds_i16:
+; PAL: .lds_size: 0x2
+; PAL: test_lds_i32:
+; PAL: .lds_size: 0x4
+; PAL: test_lds_i8:
+; PAL: .lds_size: 0x1
+
+@lds.i8 = addrspace(3) global i8 undef
+@lds.array.i8 = addrspace(3) global [327679 x i8] undef
+@lds.i16 = addrspace(3) global i16 undef
+@lds.array.i16 = addrspace(3) global [163839 x i16] undef
+@lds.i32 = addrspace(3) global i32 undef
+@lds.array.i32 = addrspace(3) global [81919 x i32] undef
+
+define amdgpu_gfx void @test_lds_i8(i8 %val) {
+ store i8 %val, ptr addrspace(3) @lds.i8
+ ret void
+}
+
+define amdgpu_gfx void @test_lds_i16(i16 %val) {
+ store i16 %val, ptr addrspace(3) @lds.i16
+ ret void
+}
+
+define amdgpu_gfx void @test_lds_i32(i32 %val) {
+ store i32 %val, ptr addrspace(3) @lds.i32
+ ret void
+}
+
+define amdgpu_gfx void @test_lds_array_i8() {
+ %gep = getelementptr inbounds [327679 x i8], ptr addrspace(3) @lds.array.i8, i32 0, i32 5
+ %val = load i8, ptr addrspace(3) %gep
+ store i8 %val, ptr addrspace(3) @lds.i8
+ ret void
+}
+
+define amdgpu_gfx void @test_lds_array_i16() {
+ %gep = getelementptr inbounds [163839 x i16], ptr addrspace(3) @lds.array.i16, i32 0, i32 10
+ %val = load i16, ptr addrspace(3) %gep
+ store i16 %val, ptr addrspace(3) @lds.i16
+ ret void
+}
+
+define amdgpu_gfx void @test_lds_array_i32() {
+ %gep = getelementptr inbounds [81919 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
+ %val = load i32, ptr addrspace(3) %gep
+ store i32 %val, ptr addrspace(3) @lds.i32
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-pal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lds-size-pal-metadata.ll
new file mode 100644
index 0000000..270c17f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-size-pal-metadata.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=PAL %s
+
+; Test that the zero lds_size of f2 doesn't overwrite the lds_size of f1.
+@x = addrspace(3) global i32 poison
+
+; PAL: .hardware_stages:
+; PAL: .lds_size: 0x200
+; PAL: .shader_functions:
+; PAL: f1:
+; PAL: .lds_size: 0x4
+; PAL: f2:
+; PAL: .lds_size: 0
+
+define amdgpu_gfx void @f1(i32 %val) {
+ store i32 %val, ptr addrspace(3) @x
+ ret void
+}
+
+define amdgpu_gfx void @f2(i32 %a, ptr addrspace(1) %ptr) {
+ store i32 %a, ptr addrspace(1) %ptr
+ ret void
+}
+
+!amdgpu.pal.metadata.msgpack = !{!8}
+!8 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\10\00\00\06\CE\FF\FF\FF\FF\00\01\02\03\04\05\06\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\10\00\00\02\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\10\AB.vgpr_limit\CC\80\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF\F6\B5\A6D\E3\BE\9D\D6\CFF\\=l\09\AB\F0#\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\07\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF-ua\DD\EA7\19\94\CF\80\16\9A\FC\9B\A6\1Dk\AD.llpc_version\A477.4\AEamdpal.version\92\03\00"}
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll
index 1280531..f074d03 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll
@@ -14,12 +14,7 @@ define half @raw_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -67,12 +62,7 @@ define <2 x half> @raw_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_s
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -120,12 +110,7 @@ define <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_s
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -177,12 +162,7 @@ define half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(<
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -230,12 +210,7 @@ define <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_s
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll
index 338b0ea..8f67185 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll
@@ -14,12 +14,7 @@ define float @raw_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -67,12 +62,7 @@ define <2 x float> @raw_buffer_load_format_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -123,12 +113,7 @@ define <3 x float> @raw_buffer_load_format_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -181,12 +166,7 @@ define <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -242,12 +222,7 @@ define float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -295,12 +270,7 @@ define <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll
index 873c701..2ac8c09 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll
@@ -14,12 +14,7 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i3
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -68,12 +63,7 @@ define float @raw_buffer_load_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(<4 x i3
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -122,12 +112,7 @@ define float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i3
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -176,12 +161,7 @@ define float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(<4 x i3
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -230,12 +210,7 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(<4
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -284,12 +259,7 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(<4
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -338,12 +308,7 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(<4
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -392,12 +357,7 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_dlc
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -446,12 +406,7 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc_dlc
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -500,12 +455,7 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc_slc
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -554,12 +504,7 @@ define <2 x float> @raw_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -610,12 +555,7 @@ define <3 x float> @raw_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -668,12 +608,7 @@ define <4 x float> @raw_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -728,12 +663,7 @@ define half @raw_buffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -782,12 +712,7 @@ define <2 x half> @raw_buffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -835,12 +760,7 @@ define <4 x half> @raw_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -891,12 +811,7 @@ define float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext(<4
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -946,12 +861,7 @@ define float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext(<4
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1001,12 +911,7 @@ define float @raw_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext(<4
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1056,12 +961,7 @@ define float @raw_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext(<4
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1112,12 +1012,7 @@ define half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1167,12 +1062,7 @@ define float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1221,12 +1111,7 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffse
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1273,12 +1158,7 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffse
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1325,13 +1205,8 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffse
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1379,12 +1254,7 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1433,12 +1303,7 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffse
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1487,14 +1352,9 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffse
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1542,10 +1402,6 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
@@ -1591,10 +1447,6 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
@@ -1641,14 +1493,9 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1697,14 +1544,9 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1753,14 +1595,9 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1810,14 +1647,9 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffset
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1867,14 +1699,9 @@ define float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll
index 7e020dd..d7037f1 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll
@@ -14,12 +14,7 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -66,12 +61,7 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_so
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -119,12 +109,7 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -173,16 +158,9 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -232,16 +210,9 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -288,10 +259,6 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
@@ -337,10 +304,6 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
@@ -387,12 +350,7 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -441,12 +399,7 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -495,14 +448,9 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -554,18 +502,11 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll
index f999515..2daa826 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll
@@ -14,12 +14,7 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -66,12 +61,7 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_so
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -120,16 +110,9 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -179,17 +162,9 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]]
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -240,18 +215,9 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -302,18 +268,9 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -361,13 +318,7 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
@@ -415,13 +366,7 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
@@ -470,16 +415,9 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -529,16 +467,9 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -588,18 +519,11 @@ define void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -653,20 +577,11 @@ define void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_so
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll
index eb95368..78cb38a 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll
@@ -15,12 +15,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(<
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -69,12 +64,7 @@ define void @raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr_soffset(<
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -123,12 +113,7 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(<
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -177,12 +162,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset(<
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -231,12 +211,7 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffset(<
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -284,12 +259,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_g
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -337,12 +307,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_s
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -390,12 +355,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_g
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -443,12 +403,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_d
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -496,12 +451,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_s
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -549,12 +499,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_g
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -602,12 +547,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_g
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -656,16 +596,9 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -715,17 +648,9 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]]
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -776,18 +701,9 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -835,12 +751,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_i
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -889,12 +800,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_i
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -943,13 +849,8 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -997,12 +898,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1051,16 +947,9 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1109,16 +998,9 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1165,12 +1047,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset4095
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1217,13 +1094,8 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset4096
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1271,12 +1143,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1325,12 +1192,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1379,14 +1241,9 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1434,10 +1291,6 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
@@ -1483,10 +1336,6 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
@@ -1533,12 +1382,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1587,12 +1431,7 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1641,14 +1480,9 @@ define void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1698,14 +1532,9 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_o
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1754,13 +1583,8 @@ define void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr_soffset_o
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll
index 3012767..f5fd059 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll
@@ -14,22 +14,13 @@ define half @raw_ptr_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -77,22 +68,13 @@ define <2 x half> @raw_ptr_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffset__sg
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -140,22 +122,13 @@ define <4 x half> @raw_ptr_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sg
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -207,22 +180,13 @@ define half @raw_ptr_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -270,22 +234,13 @@ define <4 x half> @raw_ptr_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffset__sg
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll
index 07b63a8..36cb962 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll
@@ -14,22 +14,13 @@ define float @raw_ptr_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -77,22 +68,13 @@ define <2 x float> @raw_ptr_buffer_load_format_v2f32__sgpr_rsrc__vgpr_voffset__s
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -143,22 +125,13 @@ define <3 x float> @raw_ptr_buffer_load_format_v3f32__sgpr_rsrc__vgpr_voffset__s
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -211,22 +184,13 @@ define <4 x float> @raw_ptr_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__s
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -282,22 +246,13 @@ define float @raw_ptr_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -345,22 +300,13 @@ define <4 x float> @raw_ptr_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voffset__s
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll
index c9c24e2..8c92175 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll
@@ -14,22 +14,13 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -78,22 +69,13 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(ptr
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -142,22 +124,13 @@ define float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -206,22 +179,13 @@ define float @raw_ptr_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(ptr
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -270,22 +234,13 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -334,22 +289,13 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -398,22 +344,13 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -462,22 +399,13 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -526,22 +454,13 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -590,22 +509,13 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -654,22 +564,13 @@ define <2 x float> @raw_ptr_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_sof
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -720,22 +621,13 @@ define <3 x float> @raw_ptr_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_sof
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -788,22 +680,13 @@ define <4 x float> @raw_ptr_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_sof
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -858,22 +741,13 @@ define half @raw_ptr_buffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -922,22 +796,13 @@ define <2 x half> @raw_ptr_buffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -985,22 +850,13 @@ define <4 x half> @raw_ptr_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1051,22 +907,13 @@ define float @raw_ptr_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zext
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1116,22 +963,13 @@ define float @raw_ptr_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sext
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1181,22 +1019,13 @@ define float @raw_ptr_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_zex
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1246,22 +1075,13 @@ define float @raw_ptr_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sex
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1312,22 +1132,13 @@ define half @raw_ptr_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1377,22 +1188,13 @@ define float @raw_ptr_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1441,22 +1243,13 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__vo
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1503,22 +1296,13 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__vo
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY5]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1565,23 +1349,14 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__vo
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY5]], %subreg.sub3
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1629,22 +1404,13 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_vof
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1693,22 +1459,13 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__vo
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1757,24 +1514,15 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__vo
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1822,20 +1570,12 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY5]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
@@ -1881,20 +1621,12 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY5]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
@@ -1941,24 +1673,15 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -2007,24 +1730,15 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -2073,24 +1787,15 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -2140,24 +1845,15 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_sof
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -2207,24 +1903,15 @@ define float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_vof
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll
index 85d4ddc..9c0247a 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll
@@ -14,22 +14,13 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -76,22 +67,13 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgp
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -139,22 +121,13 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -203,26 +176,15 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
- ; GFX908-NEXT: [[DEF10:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -272,26 +234,15 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
- ; GFX908-NEXT: [[DEF10:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -338,20 +289,12 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
@@ -397,20 +340,12 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
@@ -457,22 +392,13 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -521,22 +447,13 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -585,24 +502,15 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -654,28 +562,17 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
- ; GFX908-NEXT: [[DEF10:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll
index 89dbb03..3160899 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll
@@ -14,22 +14,13 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -76,22 +67,13 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgp
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -140,26 +122,15 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
- ; GFX908-NEXT: [[DEF10:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -209,27 +180,15 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF10:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2
; GFX908-NEXT: [[COPY13:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE3]]
- ; GFX908-NEXT: [[DEF11:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -280,28 +239,15 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY12]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY10]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF10:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF11:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[COPY14:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE3]]
- ; GFX908-NEXT: [[DEF12:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -352,28 +298,15 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY12]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY10]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF10:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF11:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[COPY14:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE3]]
- ; GFX908-NEXT: [[DEF12:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -421,23 +354,13 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
@@ -485,23 +408,13 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
@@ -550,26 +463,15 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
- ; GFX908-NEXT: [[DEF10:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -619,26 +521,15 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
- ; GFX908-NEXT: [[DEF10:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -688,28 +579,17 @@ define void @raw_ptr_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
- ; GFX908-NEXT: [[DEF10:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -763,30 +643,17 @@ define void @raw_ptr_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffset__sgp
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY12]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY10]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF10:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF11:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
; GFX908-NEXT: [[COPY14:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE3]]
- ; GFX908-NEXT: [[DEF12:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll
index c44ebaf..e8d3d34 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll
@@ -15,22 +15,13 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -79,22 +70,13 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -143,22 +125,13 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -207,22 +180,13 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -271,22 +235,13 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -334,22 +289,13 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -397,22 +343,13 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -460,22 +397,13 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -523,22 +451,13 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -586,22 +505,13 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -649,22 +559,13 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -712,22 +613,13 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -776,26 +668,15 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
- ; GFX908-NEXT: [[DEF10:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -845,27 +726,15 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF10:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2
; GFX908-NEXT: [[COPY13:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE3]]
- ; GFX908-NEXT: [[DEF11:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -916,28 +785,15 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY12]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY10]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF10:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF11:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[COPY14:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE3]]
- ; GFX908-NEXT: [[DEF12:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -985,22 +841,13 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1049,22 +896,13 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1113,23 +951,14 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1177,22 +1006,13 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1241,26 +1061,15 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
- ; GFX908-NEXT: [[DEF10:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1309,26 +1118,15 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
- ; GFX908-NEXT: [[DEF10:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1375,22 +1173,13 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1437,23 +1226,14 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_voffset
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1501,22 +1281,13 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1565,22 +1336,13 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1629,24 +1391,15 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1694,20 +1447,12 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
@@ -1753,20 +1498,12 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
@@ -1813,22 +1550,13 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1877,22 +1605,13 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1941,24 +1660,15 @@ define void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -2008,24 +1718,15 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -2074,23 +1775,14 @@ define void @raw_ptr_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr_soffs
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll
index 1d1d4a4..f35d1f1 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll
@@ -13,22 +13,13 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr add
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -76,22 +67,13 @@ define <2 x half> @raw_tbuffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -139,22 +121,13 @@ define <4 x half> @raw_tbuffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -205,22 +178,13 @@ define half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(ptr add
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -268,22 +232,13 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(ptr
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -331,22 +286,13 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(ptr
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -394,22 +340,13 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -457,22 +394,13 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(ptr
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll
index 37902cd..da07f15 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll
@@ -13,22 +13,13 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr ad
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -76,22 +67,13 @@ define <2 x float> @raw_tbuffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -142,22 +124,13 @@ define <3 x float> @raw_tbuffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -210,22 +183,13 @@ define <4 x float> @raw_tbuffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -280,22 +244,13 @@ define float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(ptr ad
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -343,22 +298,13 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(pt
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -406,22 +352,13 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(pt
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -469,22 +406,13 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -532,22 +460,13 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(pt
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll
index 688aaaf..7441461 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll
@@ -14,22 +14,13 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(half %
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -77,22 +68,13 @@ define void @raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -141,26 +123,15 @@ define void @raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
- ; GFX908-NEXT: [[DEF10:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -209,22 +180,13 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(half %
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -273,22 +235,13 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soffset(half %
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -337,22 +290,13 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(half %
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -400,22 +344,13 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(ha
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -463,22 +398,13 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(ha
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -526,22 +452,13 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -589,22 +506,13 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(ha
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll
index eb5416e..59207c9 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll
@@ -15,22 +15,13 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -80,26 +71,15 @@ define void @raw_tbuffer_store_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE3]]
- ; GFX908-NEXT: [[DEF10:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -150,27 +130,15 @@ define void @raw_tbuffer_store_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<3 x
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF10:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1, [[COPY6]], %subreg.sub2
; GFX908-NEXT: [[COPY13:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE3]]
- ; GFX908-NEXT: [[DEF11:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -222,28 +190,15 @@ define void @raw_tbuffer_store_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY12]], %subreg.sub1, [[COPY11]], %subreg.sub2, [[COPY10]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF9:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF10:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF11:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
; GFX908-NEXT: [[COPY14:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE3]]
- ; GFX908-NEXT: [[DEF12:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -292,22 +247,13 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(float
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -356,22 +302,13 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(float
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -420,22 +357,13 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(float
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -484,22 +412,13 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -548,22 +467,13 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(fl
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -612,22 +522,13 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(fl
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -676,22 +577,13 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -740,22 +632,13 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(fl
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -802,22 +685,13 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffs
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -864,22 +738,13 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -926,23 +791,14 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -990,22 +846,13 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffse
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1054,22 +901,13 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1118,24 +956,15 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1183,20 +1012,12 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
@@ -1242,20 +1063,12 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
@@ -1302,24 +1115,15 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1368,24 +1172,15 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1434,24 +1229,15 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1501,24 +1287,15 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1568,24 +1345,15 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffse
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0, [[COPY9]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY7]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll
index 61c260e..62edf72 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll
@@ -13,12 +13,7 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i3
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -66,12 +61,7 @@ define <2 x half> @raw_tbuffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -119,12 +109,7 @@ define <4 x half> @raw_tbuffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -175,12 +160,7 @@ define half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(<4 x i3
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -228,12 +208,7 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(<4
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -281,12 +256,7 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(<4
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -334,12 +304,7 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -387,12 +352,7 @@ define half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(<4
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll
index 8261461..45180c6 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll
@@ -13,12 +13,7 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -66,12 +61,7 @@ define <2 x float> @raw_tbuffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -122,12 +112,7 @@ define <3 x float> @raw_tbuffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -180,12 +165,7 @@ define <4 x float> @raw_tbuffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffse
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -240,12 +220,7 @@ define float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(<4 x i
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -293,12 +268,7 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(<4
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -346,12 +316,7 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(<4
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -399,12 +364,7 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -452,12 +412,7 @@ define float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(<4
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll
index 84f4258..78241c4 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll
@@ -14,12 +14,7 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(half %
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -67,12 +62,7 @@ define void @raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -121,16 +111,9 @@ define void @raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -179,12 +162,7 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffset(half %
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -233,12 +211,7 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soffset(half %
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -287,12 +260,7 @@ define void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffset(half %
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -340,12 +308,7 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(ha
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -393,12 +356,7 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(ha
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -446,12 +404,7 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -499,12 +452,7 @@ define void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(ha
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll
index 63f0e43..a11bc33 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll
@@ -15,12 +15,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -70,16 +65,9 @@ define void @raw_tbuffer_store_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -130,17 +118,9 @@ define void @raw_tbuffer_store_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<3 x
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1, [[COPY6]], %subreg.sub2
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE1]]
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -192,18 +172,9 @@ define void @raw_tbuffer_store_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x
; GFX908-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY8]], %subreg.sub1, [[COPY7]], %subreg.sub2, [[COPY6]], %subreg.sub3
; GFX908-NEXT: [[COPY10:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE1]]
- ; GFX908-NEXT: [[DEF8:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -252,12 +223,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffset(float
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -306,12 +272,7 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffset(float
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -360,12 +321,7 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffset(float
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -414,12 +370,7 @@ define void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -468,12 +419,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc(fl
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -522,12 +468,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(fl
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -576,12 +517,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_gl
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -630,12 +566,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc(fl
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -682,12 +613,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffset__voffs
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -734,12 +660,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -786,13 +707,8 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -840,12 +756,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffse
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -894,12 +805,7 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -948,14 +854,9 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffs
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1003,10 +904,6 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
@@ -1052,10 +949,6 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
@@ -1102,14 +995,9 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1158,14 +1046,9 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1214,14 +1097,9 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1271,14 +1149,9 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_soffse
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1328,14 +1201,9 @@ define void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffse
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
; GFX908-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], killed [[S_MOV_B32_]], 0, implicit $exec
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll b/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll
index 893f6b1..4383686 100644
--- a/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll
+++ b/llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll
@@ -12,7 +12,6 @@ define float @llvm_amdgcn_raw_buffer_load_f32(i32 %voffset, i32 %soffset) {
; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -46,7 +45,6 @@ define float @llvm_amdgcn_raw_tbuffer_load_f32(i32 %voffset, i32 %soffset) {
; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -80,7 +78,6 @@ define <2 x float> @llvm_amdgcn_raw_buffer_load_v2f32(i32 %voffset, i32 %soffset
; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -117,7 +114,6 @@ define <2 x float> @llvm_amdgcn_raw_tbuffer_load_v2f32(i32 %voffset, i32 %soffse
; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -154,7 +150,6 @@ define <3 x float> @llvm_amdgcn_raw_buffer_load_v3f32(i32 %voffset, i32 %soffset
; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -193,7 +188,6 @@ define <3 x float> @llvm_amdgcn_raw_tbuffer_load_v3f32(i32 %voffset, i32 %soffse
; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -232,7 +226,6 @@ define <4 x float> @llvm_amdgcn_raw_buffer_load_v4f32(i32 %voffset, i32 %soffset
; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -273,7 +266,6 @@ define <4 x float> @llvm_amdgcn_raw_tbuffer_load_v4f32(i32 %voffset, i32 %soffse
; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -315,7 +307,6 @@ define void @llvm_amdgcn_raw_buffer_store_f32(float %val, i32 %voffset, i32 %sof
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -349,7 +340,6 @@ define void @llvm_amdgcn_raw_tbuffer_store_f32(float %val, i32 %voffset, i32 %so
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -383,12 +373,9 @@ define void @llvm_amdgcn_raw_buffer_store_v2f32(<2 x float> %val, i32 %voffset,
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -401,7 +388,7 @@ define void @llvm_amdgcn_raw_buffer_store_v2f32(<2 x float> %val, i32 %voffset,
; GFX908-NEXT: bb.2:
; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY4]], [[COPY1]], [[DEF2]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8)
+ ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY4]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8)
; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
; GFX908-NEXT: {{ $}}
@@ -422,12 +409,9 @@ define void @llvm_amdgcn_raw_tbuffer_store_v2f32(<2 x float> %val, i32 %voffset,
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -440,7 +424,7 @@ define void @llvm_amdgcn_raw_tbuffer_store_v2f32(<2 x float> %val, i32 %voffset,
; GFX908-NEXT: bb.2:
; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY4]], [[COPY1]], [[DEF2]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8)
+ ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY4]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 8)
; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
; GFX908-NEXT: {{ $}}
@@ -462,13 +446,9 @@ define void @llvm_amdgcn_raw_buffer_store_v3f32(<3 x float> %val, i32 %voffset,
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]]
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -481,7 +461,7 @@ define void @llvm_amdgcn_raw_buffer_store_v3f32(<3 x float> %val, i32 %voffset,
; GFX908-NEXT: bb.2:
; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[COPY5]], [[COPY1]], [[DEF3]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8)
+ ; GFX908-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[COPY5]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8)
; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
; GFX908-NEXT: {{ $}}
@@ -503,13 +483,9 @@ define void @llvm_amdgcn_raw_tbuffer_store_v3f32(<3 x float> %val, i32 %voffset,
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]]
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -522,7 +498,7 @@ define void @llvm_amdgcn_raw_tbuffer_store_v3f32(<3 x float> %val, i32 %voffset,
; GFX908-NEXT: bb.2:
; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY5]], [[COPY1]], [[DEF3]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8)
+ ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY5]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 8)
; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
; GFX908-NEXT: {{ $}}
@@ -545,14 +521,9 @@ define void @llvm_amdgcn_raw_buffer_store_v4f32(<4 x float> %val, i32 %voffset,
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -565,7 +536,7 @@ define void @llvm_amdgcn_raw_buffer_store_v4f32(<4 x float> %val, i32 %voffset,
; GFX908-NEXT: bb.2:
; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[COPY6]], [[COPY1]], [[DEF4]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GFX908-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[COPY6]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
; GFX908-NEXT: {{ $}}
@@ -588,14 +559,9 @@ define void @llvm_amdgcn_raw_tbuffer_store_v4f32(<4 x float> %val, i32 %voffset,
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -608,7 +574,7 @@ define void @llvm_amdgcn_raw_tbuffer_store_v4f32(<4 x float> %val, i32 %voffset,
; GFX908-NEXT: bb.2:
; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY6]], [[COPY1]], [[DEF4]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY6]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
; GFX908-NEXT: {{ $}}
@@ -630,7 +596,6 @@ define float @llvm_amdgcn_raw_ptr_buffer_load_f32(i32 %voffset, i32 %soffset) {
; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -664,7 +629,6 @@ define float @llvm_amdgcn_raw_ptr_tbuffer_load_f32(i32 %voffset, i32 %soffset) {
; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -698,7 +662,6 @@ define <2 x float> @llvm_amdgcn_raw_ptr_buffer_load_v2f32(i32 %voffset, i32 %sof
; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -735,7 +698,6 @@ define <2 x float> @llvm_amdgcn_raw_ptr_tbuffer_load_v2f32(i32 %voffset, i32 %so
; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -772,7 +734,6 @@ define <3 x float> @llvm_amdgcn_raw_ptr_buffer_load_v3f32(i32 %voffset, i32 %sof
; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -811,7 +772,6 @@ define <3 x float> @llvm_amdgcn_raw_ptr_tbuffer_load_v3f32(i32 %voffset, i32 %so
; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -850,7 +810,6 @@ define <4 x float> @llvm_amdgcn_raw_ptr_buffer_load_v4f32(i32 %voffset, i32 %sof
; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -891,7 +850,6 @@ define <4 x float> @llvm_amdgcn_raw_ptr_tbuffer_load_v4f32(i32 %voffset, i32 %so
; GFX908-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -933,7 +891,6 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_f32(float %val, i32 %voffset, i32
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -967,7 +924,6 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_f32(float %val, i32 %voffset, i32
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1001,12 +957,9 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v2f32(<2 x float> %val, i32 %voffs
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1019,7 +972,7 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v2f32(<2 x float> %val, i32 %voffs
; GFX908-NEXT: bb.2:
; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY4]], [[COPY1]], [[DEF2]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) poison`, align 1, addrspace 8)
+ ; GFX908-NEXT: BUFFER_STORE_DWORDX2_OFFEN_exact [[COPY4]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) poison`, align 1, addrspace 8)
; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
; GFX908-NEXT: {{ $}}
@@ -1040,12 +993,9 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v2f32(<2 x float> %val, i32 %voff
; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1058,7 +1008,7 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v2f32(<2 x float> %val, i32 %voff
; GFX908-NEXT: bb.2:
; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY4]], [[COPY1]], [[DEF2]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) poison`, align 1, addrspace 8)
+ ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[COPY4]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into `ptr addrspace(8) poison`, align 1, addrspace 8)
; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
; GFX908-NEXT: {{ $}}
@@ -1080,13 +1030,9 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v3f32(<3 x float> %val, i32 %voffs
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]]
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1099,7 +1045,7 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v3f32(<3 x float> %val, i32 %voffs
; GFX908-NEXT: bb.2:
; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[COPY5]], [[COPY1]], [[DEF3]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96) into `ptr addrspace(8) poison`, align 1, addrspace 8)
+ ; GFX908-NEXT: BUFFER_STORE_DWORDX3_OFFEN_exact [[COPY5]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s96) into `ptr addrspace(8) poison`, align 1, addrspace 8)
; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
; GFX908-NEXT: {{ $}}
@@ -1121,13 +1067,9 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v3f32(<3 x float> %val, i32 %voff
; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]]
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1140,7 +1082,7 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v3f32(<3 x float> %val, i32 %voff
; GFX908-NEXT: bb.2:
; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY5]], [[COPY1]], [[DEF3]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s96) into `ptr addrspace(8) poison`, align 1, addrspace 8)
+ ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[COPY5]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s96) into `ptr addrspace(8) poison`, align 1, addrspace 8)
; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
; GFX908-NEXT: {{ $}}
@@ -1163,14 +1105,9 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v4f32(<4 x float> %val, i32 %voffs
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1183,7 +1120,7 @@ define void @llvm_amdgcn_raw_ptr_buffer_store_v4f32(<4 x float> %val, i32 %voffs
; GFX908-NEXT: bb.2:
; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[COPY6]], [[COPY1]], [[DEF4]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into `ptr addrspace(8) poison`, align 1, addrspace 8)
+ ; GFX908-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[COPY6]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into `ptr addrspace(8) poison`, align 1, addrspace 8)
; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
; GFX908-NEXT: {{ $}}
@@ -1206,14 +1143,9 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v4f32(<4 x float> %val, i32 %voff
; GFX908-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX908-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX908-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; GFX908-NEXT: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
- ; GFX908-NEXT: [[DEF4:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF5:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
; GFX908-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
@@ -1226,7 +1158,7 @@ define void @llvm_amdgcn_raw_ptr_tbuffer_store_v4f32(<4 x float> %val, i32 %voff
; GFX908-NEXT: bb.2:
; GFX908-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
; GFX908-NEXT: {{ $}}
- ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY6]], [[COPY1]], [[DEF4]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into `ptr addrspace(8) poison`, align 1, addrspace 8)
+ ; GFX908-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[COPY6]], [[COPY1]], [[DEF]], killed [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into `ptr addrspace(8) poison`, align 1, addrspace 8)
; GFX908-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
; GFX908-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
; GFX908-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/literal64.ll b/llvm/test/CodeGen/AMDGPU/literal64.ll
index 768c972..98691d3 100644
--- a/llvm/test/CodeGen/AMDGPU/literal64.ll
+++ b/llvm/test/CodeGen/AMDGPU/literal64.ll
@@ -67,24 +67,8 @@ define void @v_mov_b64_double(ptr addrspace(1) %ptr) {
; GCN: ; %bb.0:
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: .LBB6_1: ; %atomicrmw.start
-; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT: v_add_f64_e32 v[2:3], lit64(0x4063233333333333), v[4:5]
-; GCN-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
-; GCN-NEXT: s_wait_xcnt 0x0
-; GCN-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GCN-NEXT: s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GCN-NEXT: s_cbranch_execnz .LBB6_1
-; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GCN-NEXT: v_mov_b64_e32 v[2:3], lit64(0x4063233333333333)
+; GCN-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off scope:SCOPE_SYS
; GCN-NEXT: s_set_pc_i64 s[30:31]
%result = atomicrmw fadd ptr addrspace(1) %ptr, double 153.1 monotonic
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/livevars-implicitdef.mir b/llvm/test/CodeGen/AMDGPU/livevars-implicitdef.mir
new file mode 100644
index 0000000..18aeb25
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/livevars-implicitdef.mir
@@ -0,0 +1,91 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn --run-pass=livevars -o - %s | FileCheck %s
+---
+# Check that the super register is defined for an sgpr copy.
+name: sgpr_copy
+tracksRegLiveness: true
+body: |
+ bb.0:
+
+ ; CHECK-LABEL: name: sgpr_copy
+ ; CHECK: %sval:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: $sgpr0 = COPY %sval
+ ; CHECK-NEXT: $sgpr1 = COPY %sval
+ ; CHECK-NEXT: $sgpr2 = COPY %sval
+ ; CHECK-NEXT: $sgpr3 = COPY killed %sval
+ ; CHECK-NEXT: SI_RETURN implicit killed $sgpr0_sgpr1_sgpr2_sgpr3
+ %sval:sreg_32 = S_MOV_B32 0
+
+ $sgpr0 = COPY %sval
+ $sgpr1 = COPY %sval
+ $sgpr2 = COPY %sval
+ $sgpr3 = COPY %sval
+ SI_RETURN implicit $sgpr0_sgpr1_sgpr2_sgpr3
+
+...
+---
+# Check that the super register is defined for a vgpr vector copy.
+name: vgpr_copy
+tracksRegLiveness: true
+body: |
+ bb.0:
+
+ ; CHECK-LABEL: name: vgpr_copy
+ ; CHECK: %vval:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY %vval
+ ; CHECK-NEXT: $vgpr1 = COPY %vval
+ ; CHECK-NEXT: $vgpr2 = COPY %vval
+ ; CHECK-NEXT: $vgpr3 = COPY killed %vval
+ ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3
+ %vval:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+
+ $vgpr0 = COPY %vval
+ $vgpr1 = COPY %vval
+ $vgpr2 = COPY %vval
+ $vgpr3 = COPY %vval
+ %0:vgpr_32 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+
+...
+---
+# Check that the super register is defined when there is a hole in the copied sequence (sgpr1 is skipped).
+name: sgpr_copy_hole
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: sgpr_copy_hole
+ ; CHECK: %sval:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: $sgpr0 = COPY %sval
+ ; CHECK-NEXT: $sgpr2 = COPY %sval
+ ; CHECK-NEXT: $sgpr3 = COPY killed %sval
+ ; CHECK-NEXT: SI_RETURN implicit killed $sgpr0_sgpr1_sgpr2_sgpr3
+ %sval:sreg_32 = S_MOV_B32 0
+
+ $sgpr0 = COPY %sval
+ $sgpr2 = COPY %sval
+ $sgpr3 = COPY %sval
+ SI_RETURN implicit $sgpr0_sgpr1_sgpr2_sgpr3
+
+...
+---
+# Check that the super register is defined when a register-pair use interrupts the sequence.
+name: vgpr_copy_pair
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: vgpr_copy_pair
+ ; CHECK: %vval:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY %vval
+ ; CHECK-NEXT: $vgpr1 = COPY %vval
+ ; CHECK-NEXT: $vgpr2 = COPY %vval
+ ; CHECK-NEXT: $vgpr3 = COPY killed %vval
+ ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: dead [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3
+ %vval:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+
+ $vgpr0 = COPY %vval
+ $vgpr1 = COPY %vval
+ $vgpr2 = COPY %vval
+ $vgpr3 = COPY %vval
+ %0:vgpr_32 = COPY $vgpr1_vgpr2
+ %1:vgpr_32 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index ceed41f..6df3d25 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -8,11 +8,11 @@
; RUN: | FileCheck -check-prefix=GCN-O3 %s
-; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
-; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
-; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
define void @empty() {
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 2a5c652..36231ab 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -49,6 +49,7 @@
; GCN-O0-NEXT: Expand reduction intrinsics
; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O0-NEXT: Lower buffer fat pointer operations to buffer resources
+; GCN-O0-NEXT: AMDGPU lower intrinsics
; GCN-O0-NEXT: CallGraph Construction
; GCN-O0-NEXT: Call Graph SCC Pass Manager
; GCN-O0-NEXT: DummyCGSCCPass
@@ -231,6 +232,7 @@
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O1-NEXT: Lower buffer fat pointer operations to buffer resources
+; GCN-O1-NEXT: AMDGPU lower intrinsics
; GCN-O1-NEXT: CallGraph Construction
; GCN-O1-NEXT: Call Graph SCC Pass Manager
; GCN-O1-NEXT: DummyCGSCCPass
@@ -255,6 +257,7 @@
; GCN-O1-NEXT: Uniformity Analysis
; GCN-O1-NEXT: AMDGPU IR late optimizations
; GCN-O1-NEXT: Post-Dominator Tree Construction
+; GCN-O1-NEXT: Uniformity Analysis
; GCN-O1-NEXT: Unify divergent function exit nodes
; GCN-O1-NEXT: Dominator Tree Construction
; GCN-O1-NEXT: Cycle Info Analysis
@@ -529,6 +532,7 @@
; GCN-O1-OPTS-NEXT: FunctionPass Manager
; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O1-OPTS-NEXT: Lower buffer fat pointer operations to buffer resources
+; GCN-O1-OPTS-NEXT: AMDGPU lower intrinsics
; GCN-O1-OPTS-NEXT: CallGraph Construction
; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager
; GCN-O1-OPTS-NEXT: DummyCGSCCPass
@@ -559,6 +563,7 @@
; GCN-O1-OPTS-NEXT: Uniformity Analysis
; GCN-O1-OPTS-NEXT: AMDGPU IR late optimizations
; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction
+; GCN-O1-OPTS-NEXT: Uniformity Analysis
; GCN-O1-OPTS-NEXT: Unify divergent function exit nodes
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Cycle Info Analysis
@@ -845,6 +850,7 @@
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O2-NEXT: Lower buffer fat pointer operations to buffer resources
+; GCN-O2-NEXT: AMDGPU lower intrinsics
; GCN-O2-NEXT: CallGraph Construction
; GCN-O2-NEXT: Call Graph SCC Pass Manager
; GCN-O2-NEXT: DummyCGSCCPass
@@ -875,6 +881,7 @@
; GCN-O2-NEXT: Uniformity Analysis
; GCN-O2-NEXT: AMDGPU IR late optimizations
; GCN-O2-NEXT: Post-Dominator Tree Construction
+; GCN-O2-NEXT: Uniformity Analysis
; GCN-O2-NEXT: Unify divergent function exit nodes
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Cycle Info Analysis
@@ -1176,6 +1183,7 @@
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O3-NEXT: Lower buffer fat pointer operations to buffer resources
+; GCN-O3-NEXT: AMDGPU lower intrinsics
; GCN-O3-NEXT: CallGraph Construction
; GCN-O3-NEXT: Call Graph SCC Pass Manager
; GCN-O3-NEXT: DummyCGSCCPass
@@ -1206,6 +1214,7 @@
; GCN-O3-NEXT: Uniformity Analysis
; GCN-O3-NEXT: AMDGPU IR late optimizations
; GCN-O3-NEXT: Post-Dominator Tree Construction
+; GCN-O3-NEXT: Uniformity Analysis
; GCN-O3-NEXT: Unify divergent function exit nodes
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Cycle Info Analysis
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll
index 3a55070..57967bc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll
@@ -16,7 +16,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
; SDAG-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s2
-; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_v:
@@ -27,7 +27,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; SDAG-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v1, s2
-; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
; SDAG-FAKE16-NEXT: s_endpgm
;
; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_v:
@@ -38,7 +38,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
; GISEL-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s2
-; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-REAL16-NEXT: s_endpgm
;
; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_v:
@@ -49,7 +49,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GISEL-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v0, s2
-; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-FAKE16-NEXT: s_endpgm
%cvt = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 %src) #0
store i16 %cvt, ptr %out, align 2
@@ -58,33 +58,21 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 {
; SDAG-REAL16-LABEL: sat_pk4_i4_i8_f32_s:
-; SDAG-REAL16: ; %bb.1:
-; SDAG-REAL16-NEXT: s_load_b32 s8, s[4:5], 0x0
-; SDAG-REAL16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-REAL16-NEXT: s_branch .LBB1_0
-; SDAG-REAL16-NEXT: .p2align 8
-; SDAG-REAL16-NEXT: ; %bb.2:
-; SDAG-REAL16-NEXT: .LBB1_0:
+; SDAG-REAL16: ; %bb.0:
; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; SDAG-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s8
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
-; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_s:
-; SDAG-FAKE16: ; %bb.1:
-; SDAG-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x0
-; SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-FAKE16-NEXT: s_branch .LBB1_0
-; SDAG-FAKE16-NEXT: .p2align 8
-; SDAG-FAKE16-NEXT: ; %bb.2:
-; SDAG-FAKE16-NEXT: .LBB1_0:
+; SDAG-FAKE16: ; %bb.0:
; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v1, s8
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
; SDAG-FAKE16-NEXT: s_endpgm
;
; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_s:
@@ -95,7 +83,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 {
; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
; GISEL-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s2
-; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-REAL16-NEXT: s_endpgm
;
; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_s:
@@ -106,7 +94,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 {
; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GISEL-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v0, s2
-; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-FAKE16-NEXT: s_endpgm
%cvt = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 %src) #0
store i16 %cvt, ptr %out, align 2
@@ -120,7 +108,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
; SDAG-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, 0x64
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
-; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_i:
@@ -129,7 +117,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v1, 0x64
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
; SDAG-FAKE16-NEXT: s_endpgm
;
; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_i:
@@ -138,7 +126,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
; GISEL-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, 0x64
; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
-; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-REAL16-NEXT: s_endpgm
;
; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_i:
@@ -147,7 +135,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
; GISEL-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v0, 0x64
; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-FAKE16-NEXT: s_endpgm
%cvt = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 100) #0
store i16 %cvt, ptr %out, align 2
@@ -163,7 +151,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
; SDAG-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s2
-; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_v:
@@ -174,7 +162,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; SDAG-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v1, s2
-; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
; SDAG-FAKE16-NEXT: s_endpgm
;
; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_v:
@@ -185,7 +173,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
; GISEL-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s2
-; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-REAL16-NEXT: s_endpgm
;
; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_v:
@@ -196,7 +184,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GISEL-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v0, s2
-; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-FAKE16-NEXT: s_endpgm
%cvt = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 %src) #0
store i16 %cvt, ptr %out, align 2
@@ -205,33 +193,21 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 {
; SDAG-REAL16-LABEL: sat_pk4_u4_u8_f32_s:
-; SDAG-REAL16: ; %bb.1:
-; SDAG-REAL16-NEXT: s_load_b32 s8, s[4:5], 0x0
-; SDAG-REAL16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-REAL16-NEXT: s_branch .LBB4_0
-; SDAG-REAL16-NEXT: .p2align 8
-; SDAG-REAL16-NEXT: ; %bb.2:
-; SDAG-REAL16-NEXT: .LBB4_0:
+; SDAG-REAL16: ; %bb.0:
; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; SDAG-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s8
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
-; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_s:
-; SDAG-FAKE16: ; %bb.1:
-; SDAG-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x0
-; SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-FAKE16-NEXT: s_branch .LBB4_0
-; SDAG-FAKE16-NEXT: .p2align 8
-; SDAG-FAKE16-NEXT: ; %bb.2:
-; SDAG-FAKE16-NEXT: .LBB4_0:
+; SDAG-FAKE16: ; %bb.0:
; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v1, s8
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
; SDAG-FAKE16-NEXT: s_endpgm
;
; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_s:
@@ -242,7 +218,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 {
; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
; GISEL-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s2
-; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-REAL16-NEXT: s_endpgm
;
; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_s:
@@ -253,7 +229,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 {
; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GISEL-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v0, s2
-; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-FAKE16-NEXT: s_endpgm
%cvt = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 %src) #0
store i16 %cvt, ptr %out, align 2
@@ -267,7 +243,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
; SDAG-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, 0x64
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
-; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_i:
@@ -276,7 +252,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v1, 0x64
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] scope:SCOPE_SE
; SDAG-FAKE16-NEXT: s_endpgm
;
; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_i:
@@ -285,7 +261,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
; GISEL-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, 0x64
; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0
-; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-REAL16-NEXT: s_endpgm
;
; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_i:
@@ -294,7 +270,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
; GISEL-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v0, 0x64
; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] scope:SCOPE_SE
; GISEL-FAKE16-NEXT: s_endpgm
%cvt = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 100) #0
store i16 %cvt, ptr %out, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index a9a6431..18afb7c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; XUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
@@ -7,123 +8,210 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) nounwind readnone
declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind readnone
-; GCN-LABEL: {{^}}test_div_fmas_f32:
-; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
-; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
-
-; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
-; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
-; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94
-
-; GCN-DAG: s_bitcmp1_b32 s{{[0-9]+}}, 0
-
-; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
-; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
-; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
-; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], [[VC]]
-; GCN: buffer_store_dword [[RESULT]],
define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
+; GCN-LABEL: test_div_fmas_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT: s_load_dword s6, s[4:5], 0x13
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x2e
+; GCN-NEXT: s_load_dword s7, s[4:5], 0x1c
+; GCN-NEXT: s_load_dword s4, s[4:5], 0x25
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_bitcmp1_b32 s2, 0
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, v2
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
-; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0:
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
-; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
-; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
-; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
-; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]]
-; SI: buffer_store_dword [[RESULT]],
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
+; GCN-LABEL: test_div_fmas_f32_inline_imm_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x2e
+; GCN-NEXT: s_load_dword s6, s[4:5], 0x1c
+; GCN-NEXT: s_load_dword s4, s[4:5], 0x25
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_bitcmp1_b32 s2, 0
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: v_div_fmas_f32 v0, 1.0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
%result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
-; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1:
-; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
-
-; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
-; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94
-
-; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
-; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
-; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]]
-; GCN: buffer_store_dword [[RESULT]],
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out, float %a, float %b, float %c, [8 x i32], i1 %d) nounwind {
+; GCN-LABEL: test_div_fmas_f32_inline_imm_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x16
+; GCN-NEXT: s_load_dword s6, s[4:5], 0xb
+; GCN-NEXT: s_load_dword s4, s[4:5], 0xd
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_bitcmp1_b32 s2, 0
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: v_div_fmas_f32 v0, v0, 1.0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
-; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2:
-; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
-; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
-
-; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
-; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
-
-; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
-; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
-; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0
-; GCN: buffer_store_dword [[RESULT]],
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
+; GCN-LABEL: test_div_fmas_f32_inline_imm_2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x2e
+; GCN-NEXT: s_load_dword s6, s[4:5], 0x13
+; GCN-NEXT: s_load_dword s4, s[4:5], 0x1c
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_bitcmp1_b32 s2, 0
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, 1.0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
-; GCN-LABEL: {{^}}test_div_fmas_f64:
-; GCN: v_div_fmas_f64
define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, double %b, double %c, i1 %d) nounwind {
+; GCN-LABEL: test_div_fmas_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s8, s[4:5], 0x11
+; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_bitcmp1_b32 s8, 0
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_mov_b32 s10, -1
+; GCN-NEXT: s_mov_b32 s8, s0
+; GCN-NEXT: s_mov_b32 s9, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NEXT: v_mov_b32_e32 v5, s7
+; GCN-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-NEXT: s_endpgm
%result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
store double %result, ptr addrspace(1) %out, align 8
ret void
}
-; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc:
-; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
-; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c, i32 %i) nounwind {
+; GCN-LABEL: test_div_fmas_f32_cond_to_vcc:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_eq_u32 s3, 0
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, v2
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%cmp = icmp eq i32 %i, 0
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
-; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc:
-; GCN: s_mov_b64 vcc, 0
-; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c) nounwind {
+; GCN-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_mov_b64 vcc, 0
+; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, v2
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
-; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc:
-; GCN: s_mov_b64 vcc, -1
-; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c) nounwind {
+; GCN-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_mov_b64 vcc, -1
+; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, v2
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone
store float %result, ptr addrspace(1) %out, align 4
ret void
}
-; GCN-LABEL: {{^}}test_div_fmas_f32_logical_cond_to_vcc:
-; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
-; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
-; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
-
-; SI-DAG: v_cmp_eq_u32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}}
-; SI-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0{{$}}
-; SI-DAG: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0
-; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]]
-; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]]
-; SI: s_endpgm
define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %d) nounwind {
+; GCN-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_load_dword s8, s[4:5], 0xd
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT: buffer_load_dword v0, v[1:2], s[4:7], 0 addr64 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v3, v[1:2], s[4:7], 0 addr64 offset:4 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 offset:8 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT: s_and_b64 vcc, vcc, s[2:3]
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, s7
+; GCN-NEXT: v_div_fmas_f32 v0, v0, v3, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
+; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.a = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.b = getelementptr float, ptr addrspace(1) %gep.a, i32 1
@@ -143,26 +231,39 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1
ret void
}
-; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
-
-; SI: ; %entry
-; SI: v_cmp_eq_u32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}}
-; SI: s_mov_b64 vcc, 0
-; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[CMP]]
-
-; SI: ; %bb
-; SI: buffer_load_dword [[LOAD:v[0-9]+]],
-; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]]
-; SI: s_and_b64 vcc, vcc, exec
-
-; SI: ; %exit
-; SI: s_or_b64 exec, exec, [[SAVE]]
-; SI-NOT: vcc
-; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: buffer_store_dword
-; SI: s_endpgm
-
define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) nounwind {
+; GCN-LABEL: test_div_fmas_f32_i1_phi_vcc:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[0:1], s[10:11]
+; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[3:4], s[0:3], 0 addr64
+; GCN-NEXT: buffer_load_dword v3, v[3:4], s[0:3], 0 addr64 offset:8
+; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0
+; GCN-NEXT: s_mov_b64 vcc, 0
+; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[0:1]
+; GCN-NEXT: s_cbranch_execz .LBB9_2
+; GCN-NEXT: ; %bb.1: ; %bb
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s7, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_and_b64 vcc, vcc, exec
+; GCN-NEXT: .LBB9_2: ; %exit
+; GCN-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_div_fmas_f32 v0, v1, v2, v3
+; GCN-NEXT: s_mov_b32 s10, -1
+; GCN-NEXT: s_mov_b32 s11, s3
+; GCN-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:8
+; GCN-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.out = getelementptr float, ptr addrspace(1) %out, i32 2
@@ -188,3 +289,5 @@ exit:
store float %result, ptr addrspace(1) %gep.out, align 4
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll
index 89555d3..d5fba2d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll
@@ -31,7 +31,7 @@ define amdgpu_ps void @flat_prefetch_offset(ptr %ptr) {
; GCN-NEXT: flat_prefetch_b8 v[0:1] offset:512
; GCN-NEXT: s_endpgm
entry:
- %gep = getelementptr i32, ptr %ptr, i32 128
+ %gep = getelementptr inbounds i32, ptr %ptr, i32 128
tail call void @llvm.amdgcn.flat.prefetch(ptr %gep, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll
index 047a6cc..80f9eeb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll
@@ -31,7 +31,7 @@ define amdgpu_ps void @global_prefetch_offset(ptr addrspace(1) %ptr) {
; GCN-NEXT: global_prefetch_b8 v[0:1], off offset:512
; GCN-NEXT: s_endpgm
entry:
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 128
+ %gep = getelementptr inbounds i32, ptr addrspace(1) %ptr, i32 128
tail call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %gep, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
index 3b4db4a..d45705e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
@@ -18,6 +18,16 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
+; GFX10-LABEL: gather4_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_mov_b32 s12, exec_lo
+; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; GFX10-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
; GFX11-TRUE16-LABEL: gather4_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@@ -74,6 +84,16 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
+; GFX10-LABEL: gather4_cube:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_mov_b32 s12, exec_lo
+; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; GFX10-NEXT: image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
; GFX11-TRUE16-LABEL: gather4_cube:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@@ -132,6 +152,16 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
+; GFX10-LABEL: gather4_2darray:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_mov_b32 s12, exec_lo
+; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; GFX10-NEXT: image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
; GFX11-TRUE16-LABEL: gather4_2darray:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@@ -190,6 +220,16 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
+; GFX10-LABEL: gather4_c_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_mov_b32 s12, exec_lo
+; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
+; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; GFX10-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
; GFX11-TRUE16-LABEL: gather4_c_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@@ -246,6 +286,16 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
+; GFX10-LABEL: gather4_cl_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_mov_b32 s12, exec_lo
+; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; GFX10-NEXT: image_gather4_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
; GFX11-TRUE16-LABEL: gather4_cl_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@@ -306,6 +356,16 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
+; GFX10-LABEL: gather4_c_cl_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_mov_b32 s12, exec_lo
+; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
+; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; GFX10-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
; GFX11-TRUE16-LABEL: gather4_c_cl_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@@ -362,6 +422,16 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
+; GFX10-LABEL: gather4_b_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_mov_b32 s12, exec_lo
+; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
+; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; GFX10-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
; GFX11-TRUE16-LABEL: gather4_b_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@@ -418,6 +488,16 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
+; GFX10-LABEL: gather4_c_b_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_mov_b32 s12, exec_lo
+; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
+; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; GFX10-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
; GFX11-TRUE16-LABEL: gather4_c_b_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@@ -476,6 +556,16 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
+; GFX10-LABEL: gather4_b_cl_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_mov_b32 s12, exec_lo
+; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
+; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; GFX10-NEXT: image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
; GFX11-TRUE16-LABEL: gather4_b_cl_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@@ -538,6 +628,16 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
+; GFX10-LABEL: gather4_c_b_cl_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_mov_b32 s12, exec_lo
+; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
+; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
; GFX11-TRUE16-LABEL: gather4_c_b_cl_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@@ -591,6 +691,13 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
+; GFX10-LABEL: gather4_l_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: image_gather4_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
; GFX11-TRUE16-LABEL: gather4_l_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
@@ -636,6 +743,13 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
+; GFX10-LABEL: gather4_c_l_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
+; GFX10-NEXT: image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
; GFX11-TRUE16-LABEL: gather4_c_l_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
@@ -677,6 +791,13 @@ define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
+; GFX10-LABEL: gather4_lz_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX10-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
; GFX11-TRUE16-LABEL: gather4_lz_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
@@ -718,6 +839,13 @@ define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
+; GFX10-LABEL: gather4_c_lz_2d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
+; GFX10-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
; GFX11-TRUE16-LABEL: gather4_c_lz_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
@@ -773,5 +901,4 @@ attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }
attributes #2 = { nounwind readnone }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10: {{.*}}
; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
index 46e2e92..d9226df 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
@@ -358,14 +358,15 @@ main_body:
define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 {
; GFX11-TRUE16-LABEL: v_interp_f16_imm_params:
; GFX11-TRUE16: ; %bb.0: ; %main_body
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7
-; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v2, v0.l, v2, v0.l wait_exp:7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v1 wait_exp:7
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
@@ -383,14 +384,15 @@ define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #
;
; GFX12-TRUE16-LABEL: v_interp_f16_imm_params:
; GFX12-TRUE16: ; %bb.0: ; %main_body
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7
-; GFX12-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX12-TRUE16-NEXT: v_interp_p10_f16_f32 v2, v0.l, v2, v0.l wait_exp:7
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v1 wait_exp:7
+; GFX12-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l
; GFX12-TRUE16-NEXT: ; return to shader part epilog
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
index 835c924..05b786b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
@@ -5,7 +5,7 @@
; RUN: not llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -global-isel=1 < %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s
; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -global-isel=0 < %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s
-; GISEL-ERR: LLVM ERROR: cannot select: {{.*}} = G_INTRINSIC intrinsic(@llvm.amdgcn.inverse.ballot)
+; GISEL-ERR: LLVM ERROR: cannot select: {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.inverse.ballot)
; SDAG-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.inverse.ballot
declare i1 @llvm.amdgcn.inverse.ballot(i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
index c364c391..f1dcc93 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
@@ -2,10 +2,12 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefixes=SI,SI-SDAG %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-SDAG %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-GISEL %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
; CIT-LABEL: is_local_vgpr:
@@ -90,6 +92,21 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
+; GFX1250-LABEL: is_local_vgpr:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX1250-NEXT: global_store_b32 v[0:1], v0, off
+; GFX1250-NEXT: s_endpgm
+;
; CI-GISEL-LABEL: is_local_vgpr:
; CI-GISEL: ; %bb.0:
; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -237,6 +254,23 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
; GFX9-SDAG-NEXT: .LBB1_2: ; %bb1
; GFX9-SDAG-NEXT: s_endpgm
;
+; GFX1250-SDAG-LABEL: is_local_sgpr:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s2, s1
+; GFX1250-SDAG-NEXT: s_cselect_b32 s0, -1, 0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB1_2
+; GFX1250-SDAG-NEXT: ; %bb.1: ; %bb0
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX1250-SDAG-NEXT: .LBB1_2: ; %bb1
+; GFX1250-SDAG-NEXT: s_endpgm
+;
; CI-GISEL-LABEL: is_local_sgpr:
; CI-GISEL: ; %bb.0:
; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -296,6 +330,20 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB1_2: ; %bb1
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: is_local_sgpr:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s1, s3
+; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB1_2
+; GFX1250-GISEL-NEXT: ; %bb.1: ; %bb0
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX1250-GISEL-NEXT: .LBB1_2: ; %bb1
+; GFX1250-GISEL-NEXT: s_endpgm
%val = call i1 @llvm.amdgcn.is.shared(ptr %ptr)
br i1 %val, label %bb0, label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
index 462090c..0a2e7af 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
@@ -1,12 +1,46 @@
-; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11-12,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11-12,GFX12 %s
-; GCN-LABEL: {{^}}gs_const:
-; GCN-NOT: v_cmpx
-; GCN: s_mov_b64 exec, 0
define amdgpu_gs void @gs_const() {
+; SI-LABEL: gs_const:
+; SI: ; %bb.0:
+; SI-NEXT: s_mov_b64 s[0:1], exec
+; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_mov_b32 m0, 0
+; SI-NEXT: s_nop 0
+; SI-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; SI-NEXT: s_endpgm
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: gs_const:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_mov_b64 s[0:1], exec
+; GFX10-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_mov_b32 m0, 0
+; GFX10-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-12-LABEL: gs_const:
+; GFX11-12: ; %bb.0:
+; GFX11-12-NEXT: s_mov_b64 s[0:1], exec
+; GFX11-12-NEXT: s_and_not1_b64 s[0:1], s[0:1], exec
+; GFX11-12-NEXT: s_mov_b64 exec, 0
+; GFX11-12-NEXT: s_mov_b32 m0, 0
+; GFX11-12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-12-NEXT: s_endpgm
+; GFX11-12-NEXT: ; %bb.1:
+; GFX11-12-NEXT: s_mov_b64 exec, 0
+; GFX11-12-NEXT: s_endpgm
%tmp = icmp ule i32 0, 3
%tmp1 = select i1 %tmp, float 1.000000e+00, float -1.000000e+00
%c1 = fcmp oge float %tmp1, 0.0
@@ -19,12 +53,81 @@ define amdgpu_gs void @gs_const() {
ret void
}
-; GCN-LABEL: {{^}}vcc_implicit_def:
-; GCN: v_cmp_nle_f32_e32 vcc, 0, v{{[0-9]+}}
-; GCN: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}}
-; GCN: s_and{{n2|_not1}}_b64 exec, exec, vcc
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
define amdgpu_ps void @vcc_implicit_def(float %arg13, float %arg14) {
+; SI-LABEL: vcc_implicit_def:
+; SI: ; %bb.0:
+; SI-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1
+; SI-NEXT: v_cmp_gt_f32_e64 s[0:1], 0, v0
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_cbranch_scc0 .LBB1_2
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
+; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm
+; SI-NEXT: s_mov_b32 m0, 0
+; SI-NEXT: s_nop 0
+; SI-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; SI-NEXT: s_endpgm
+; SI-NEXT: .LBB1_2:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: exp null off, off, off, off done vm
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: vcc_implicit_def:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1
+; GFX10-NEXT: v_cmp_gt_f32_e64 s[0:1], 0, v0
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_cbranch_scc0 .LBB1_2
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
+; GFX10-NEXT: s_mov_b32 m0, 0
+; GFX10-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GFX10-NEXT: exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: .LBB1_2:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: exp null off, off, off, off done vm
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: vcc_implicit_def:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1
+; GFX11-NEXT: v_cmp_gt_f32_e64 s[0:1], 0, v0
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_cbranch_scc0 .LBB1_2
+; GFX11-NEXT: ; %bb.1:
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
+; GFX11-NEXT: s_mov_b32 m0, 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: exp mrt1 v0, v0, v0, v0 done
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: .LBB1_2:
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: exp mrt0 off, off, off, off done
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: vcc_implicit_def:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1
+; GFX12-NEXT: s_mov_b64 s[2:3], exec
+; GFX12-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX12-NEXT: s_and_not1_b64 s[0:1], exec, s[0:1]
+; GFX12-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[0:1]
+; GFX12-NEXT: s_cbranch_scc0 .LBB1_2
+; GFX12-NEXT: ; %bb.1:
+; GFX12-NEXT: s_and_b64 exec, exec, s[2:3]
+; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX12-NEXT: s_mov_b32 m0, 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: export mrt1 v0, v0, v0, v0 done
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: .LBB1_2:
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: export mrt0 off, off, off, off done
+; GFX12-NEXT: s_endpgm
%tmp0 = fcmp olt float %arg13, 0.000000e+00
%c1 = fcmp oge float %arg14, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
@@ -34,31 +137,102 @@ define amdgpu_ps void @vcc_implicit_def(float %arg13, float %arg14) {
ret void
}
-; GCN-LABEL: {{^}}true:
-; GCN-NEXT: %bb.
-; GCN-NEXT: s_endpgm
define amdgpu_gs void @true() {
+; GCN-LABEL: true:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_endpgm
call void @llvm.amdgcn.kill(i1 true)
ret void
}
-; GCN-LABEL: {{^}}false:
-; GCN-NOT: v_cmpx
-; GCN: s_mov_b64 exec, 0
define amdgpu_gs void @false() {
+; SI-LABEL: false:
+; SI: ; %bb.0:
+; SI-NEXT: s_andn2_b64 exec, exec, exec
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_mov_b32 m0, 0
+; SI-NEXT: s_nop 0
+; SI-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; SI-NEXT: s_endpgm
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: false:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_andn2_b64 exec, exec, exec
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_mov_b32 m0, 0
+; GFX10-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-12-LABEL: false:
+; GFX11-12: ; %bb.0:
+; GFX11-12-NEXT: s_and_not1_b64 exec, exec, exec
+; GFX11-12-NEXT: s_mov_b64 exec, 0
+; GFX11-12-NEXT: s_mov_b32 m0, 0
+; GFX11-12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-12-NEXT: s_endpgm
+; GFX11-12-NEXT: ; %bb.1:
+; GFX11-12-NEXT: s_mov_b64 exec, 0
+; GFX11-12-NEXT: s_endpgm
call void @llvm.amdgcn.kill(i1 false)
call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
-; GCN-LABEL: {{^}}and:
-; GCN: v_cmp_lt_i32
-; GCN: v_cmp_lt_i32
-; GCN: s_or_b64 s[0:1]
-; GCN: s_and{{n2|_not1}}_b64 s[0:1], exec, s[0:1]
-; GCN: s_and{{n2|_not1}}_b64 s[2:3], s[2:3], s[0:1]
-; GCN: s_and_b64 exec, exec, s[2:3]
define amdgpu_gs void @and(i32 %a, i32 %b, i32 %c, i32 %d) {
+; SI-LABEL: and:
+; SI: ; %bb.0:
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1
+; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v3
+; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT: s_mov_b64 s[2:3], exec
+; SI-NEXT: s_andn2_b64 s[0:1], exec, s[0:1]
+; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; SI-NEXT: s_and_b64 exec, exec, s[2:3]
+; SI-NEXT: s_mov_b32 m0, 0
+; SI-NEXT: s_nop 0
+; SI-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; SI-NEXT: s_endpgm
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: and:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1
+; GFX10-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v3
+; GFX10-NEXT: s_mov_b64 s[2:3], exec
+; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX10-NEXT: s_andn2_b64 s[0:1], exec, s[0:1]
+; GFX10-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; GFX10-NEXT: s_and_b64 exec, exec, s[2:3]
+; GFX10-NEXT: s_mov_b32 m0, 0
+; GFX10-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-12-LABEL: and:
+; GFX11-12: ; %bb.0:
+; GFX11-12-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1
+; GFX11-12-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v3
+; GFX11-12-NEXT: s_mov_b64 s[2:3], exec
+; GFX11-12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX11-12-NEXT: s_and_not1_b64 s[0:1], exec, s[0:1]
+; GFX11-12-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[0:1]
+; GFX11-12-NEXT: s_and_b64 exec, exec, s[2:3]
+; GFX11-12-NEXT: s_mov_b32 m0, 0
+; GFX11-12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-12-NEXT: s_endpgm
+; GFX11-12-NEXT: ; %bb.1:
+; GFX11-12-NEXT: s_mov_b64 exec, 0
+; GFX11-12-NEXT: s_endpgm
%c1 = icmp slt i32 %a, %b
%c2 = icmp slt i32 %c, %d
%x = or i1 %c1, %c2
@@ -67,13 +241,52 @@ define amdgpu_gs void @and(i32 %a, i32 %b, i32 %c, i32 %d) {
ret void
}
-; GCN-LABEL: {{^}}andn2:
-; GCN: v_cmp_lt_i32
-; GCN: v_cmp_lt_i32
-; GCN: s_xor_b64 s[0:1]
-; GCN: s_and{{n2|_not1}}_b64 s[2:3], s[2:3], s[0:1]
-; GCN: s_and_b64 exec, exec, s[2:3]
define amdgpu_gs void @andn2(i32 %a, i32 %b, i32 %c, i32 %d) {
+; SI-LABEL: andn2:
+; SI: ; %bb.0:
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1
+; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v3
+; SI-NEXT: s_mov_b64 s[2:3], exec
+; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; SI-NEXT: s_and_b64 exec, exec, s[2:3]
+; SI-NEXT: s_mov_b32 m0, 0
+; SI-NEXT: s_nop 0
+; SI-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; SI-NEXT: s_endpgm
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: andn2:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1
+; GFX10-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v3
+; GFX10-NEXT: s_mov_b64 s[2:3], exec
+; GFX10-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; GFX10-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; GFX10-NEXT: s_and_b64 exec, exec, s[2:3]
+; GFX10-NEXT: s_mov_b32 m0, 0
+; GFX10-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-12-LABEL: andn2:
+; GFX11-12: ; %bb.0:
+; GFX11-12-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1
+; GFX11-12-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v3
+; GFX11-12-NEXT: s_mov_b64 s[2:3], exec
+; GFX11-12-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
+; GFX11-12-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[0:1]
+; GFX11-12-NEXT: s_and_b64 exec, exec, s[2:3]
+; GFX11-12-NEXT: s_mov_b32 m0, 0
+; GFX11-12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-12-NEXT: s_endpgm
+; GFX11-12-NEXT: ; %bb.1:
+; GFX11-12-NEXT: s_mov_b64 exec, 0
+; GFX11-12-NEXT: s_endpgm
%c1 = icmp slt i32 %a, %b
%c2 = icmp slt i32 %c, %d
%x = xor i1 %c1, %c2
@@ -83,135 +296,854 @@ define amdgpu_gs void @andn2(i32 %a, i32 %b, i32 %c, i32 %d) {
ret void
}
-; GCN-LABEL: {{^}}oeq:
-; GCN: v_cmp_neq_f32
+; Should use v_cmp_neq_f32
define amdgpu_gs void @oeq(float %a) {
+; SI-LABEL: oeq:
+; SI: ; %bb.0:
+; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_mov_b32 m0, 0
+; SI-NEXT: s_nop 0
+; SI-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; SI-NEXT: s_endpgm
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: oeq:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_mov_b32 m0, 0
+; GFX10-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: oeq:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_mov_b32 m0, 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: ; %bb.1:
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: oeq:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
+; GFX12-NEXT: s_mov_b64 s[0:1], exec
+; GFX12-NEXT: s_and_not1_b64 s[2:3], exec, vcc
+; GFX12-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX12-NEXT: s_mov_b32 m0, 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: ; %bb.1:
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: s_endpgm
%c1 = fcmp oeq float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
-; GCN-LABEL: {{^}}ogt:
-; GCN: v_cmp_nlt_f32
+; Should use v_cmp_nlt_f32
define amdgpu_gs void @ogt(float %a) {
+; SI-LABEL: ogt:
+; SI: ; %bb.0:
+; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_mov_b32 m0, 0
+; SI-NEXT: s_nop 0
+; SI-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; SI-NEXT: s_endpgm
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: ogt:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_mov_b32 m0, 0
+; GFX10-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: ogt:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_mov_b32 m0, 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: ; %bb.1:
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: ogt:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0
+; GFX12-NEXT: s_mov_b64 s[0:1], exec
+; GFX12-NEXT: s_and_not1_b64 s[2:3], exec, vcc
+; GFX12-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX12-NEXT: s_mov_b32 m0, 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: ; %bb.1:
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: s_endpgm
%c1 = fcmp ogt float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
-; GCN-LABEL: {{^}}oge:
-; GCN: v_cmp_nle_f32
+; Should use v_cmp_nle_f32
define amdgpu_gs void @oge(float %a) {
+; SI-LABEL: oge:
+; SI: ; %bb.0:
+; SI-NEXT: v_cmp_nle_f32_e32 vcc, 0, v0
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_mov_b32 m0, 0
+; SI-NEXT: s_nop 0
+; SI-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; SI-NEXT: s_endpgm
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: oge:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_nle_f32_e32 vcc, 0, v0
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_mov_b32 m0, 0
+; GFX10-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: oge:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cmp_nle_f32_e32 vcc, 0, v0
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_mov_b32 m0, 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: ; %bb.1:
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: oge:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_cmp_le_f32_e32 vcc, 0, v0
+; GFX12-NEXT: s_mov_b64 s[0:1], exec
+; GFX12-NEXT: s_and_not1_b64 s[2:3], exec, vcc
+; GFX12-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX12-NEXT: s_mov_b32 m0, 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: ; %bb.1:
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: s_endpgm
%c1 = fcmp oge float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
-; GCN-LABEL: {{^}}olt:
-; GCN: v_cmp_ngt_f32
+; Should use v_cmp_ngt_f32
define amdgpu_gs void @olt(float %a) {
+; SI-LABEL: olt:
+; SI: ; %bb.0:
+; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_mov_b32 m0, 0
+; SI-NEXT: s_nop 0
+; SI-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; SI-NEXT: s_endpgm
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: olt:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_mov_b32 m0, 0
+; GFX10-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: olt:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_mov_b32 m0, 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: ; %bb.1:
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: olt:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX12-NEXT: s_mov_b64 s[0:1], exec
+; GFX12-NEXT: s_and_not1_b64 s[2:3], exec, vcc
+; GFX12-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX12-NEXT: s_mov_b32 m0, 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: ; %bb.1:
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: s_endpgm
%c1 = fcmp olt float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
-; GCN-LABEL: {{^}}ole:
-; GCN: v_cmp_nge_f32
+; Should use v_cmp_nge_f32
define amdgpu_gs void @ole(float %a) {
+; SI-LABEL: ole:
+; SI: ; %bb.0:
+; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0, v0
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_mov_b32 m0, 0
+; SI-NEXT: s_nop 0
+; SI-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; SI-NEXT: s_endpgm
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: ole:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_nge_f32_e32 vcc, 0, v0
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_mov_b32 m0, 0
+; GFX10-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: ole:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cmp_nge_f32_e32 vcc, 0, v0
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_mov_b32 m0, 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: ; %bb.1:
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: ole:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_cmp_ge_f32_e32 vcc, 0, v0
+; GFX12-NEXT: s_mov_b64 s[0:1], exec
+; GFX12-NEXT: s_and_not1_b64 s[2:3], exec, vcc
+; GFX12-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX12-NEXT: s_mov_b32 m0, 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: ; %bb.1:
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: s_endpgm
%c1 = fcmp ole float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
-; GCN-LABEL: {{^}}one:
-; GCN: v_cmp_nlg_f32
+; Should use v_cmp_nlg_f32
define amdgpu_gs void @one(float %a) {
+; SI-LABEL: one:
+; SI: ; %bb.0:
+; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_mov_b32 m0, 0
+; SI-NEXT: s_nop 0
+; SI-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; SI-NEXT: s_endpgm
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: one:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_mov_b32 m0, 0
+; GFX10-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: one:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_mov_b32 m0, 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: ; %bb.1:
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: one:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_cmp_lg_f32_e32 vcc, 0, v0
+; GFX12-NEXT: s_mov_b64 s[0:1], exec
+; GFX12-NEXT: s_and_not1_b64 s[2:3], exec, vcc
+; GFX12-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX12-NEXT: s_mov_b32 m0, 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: ; %bb.1:
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: s_endpgm
%c1 = fcmp one float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
-; GCN-LABEL: {{^}}ord:
-; GCN: v_cmp_o_f32
+; Should use v_cmp_o_f32
define amdgpu_gs void @ord(float %a) {
+; SI-LABEL: ord:
+; SI: ; %bb.0:
+; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; SI-NEXT: s_mov_b64 s[0:1], exec
+; SI-NEXT: s_andn2_b64 s[2:3], exec, vcc
+; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT: s_and_b64 exec, exec, s[0:1]
+; SI-NEXT: s_mov_b32 m0, 0
+; SI-NEXT: s_nop 0
+; SI-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; SI-NEXT: s_endpgm
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: ord:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX10-NEXT: s_mov_b64 s[0:1], exec
+; GFX10-NEXT: s_andn2_b64 s[2:3], exec, vcc
+; GFX10-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX10-NEXT: s_mov_b32 m0, 0
+; GFX10-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-12-LABEL: ord:
+; GFX11-12: ; %bb.0:
+; GFX11-12-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX11-12-NEXT: s_mov_b64 s[0:1], exec
+; GFX11-12-NEXT: s_and_not1_b64 s[2:3], exec, vcc
+; GFX11-12-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX11-12-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX11-12-NEXT: s_mov_b32 m0, 0
+; GFX11-12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-12-NEXT: s_endpgm
+; GFX11-12-NEXT: ; %bb.1:
+; GFX11-12-NEXT: s_mov_b64 exec, 0
+; GFX11-12-NEXT: s_endpgm
%c1 = fcmp ord float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
-; GCN-LABEL: {{^}}uno:
-; GCN: v_cmp_u_f32
+; Should use v_cmp_u_f32
define amdgpu_gs void @uno(float %a) {
+; SI-LABEL: uno:
+; SI: ; %bb.0:
+; SI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; SI-NEXT: s_mov_b64 s[0:1], exec
+; SI-NEXT: s_andn2_b64 s[2:3], exec, vcc
+; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT: s_and_b64 exec, exec, s[0:1]
+; SI-NEXT: s_mov_b32 m0, 0
+; SI-NEXT: s_nop 0
+; SI-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; SI-NEXT: s_endpgm
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: uno:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX10-NEXT: s_mov_b64 s[0:1], exec
+; GFX10-NEXT: s_andn2_b64 s[2:3], exec, vcc
+; GFX10-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX10-NEXT: s_mov_b32 m0, 0
+; GFX10-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-12-LABEL: uno:
+; GFX11-12: ; %bb.0:
+; GFX11-12-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX11-12-NEXT: s_mov_b64 s[0:1], exec
+; GFX11-12-NEXT: s_and_not1_b64 s[2:3], exec, vcc
+; GFX11-12-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX11-12-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX11-12-NEXT: s_mov_b32 m0, 0
+; GFX11-12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-12-NEXT: s_endpgm
+; GFX11-12-NEXT: ; %bb.1:
+; GFX11-12-NEXT: s_mov_b64 exec, 0
+; GFX11-12-NEXT: s_endpgm
%c1 = fcmp uno float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
-; GCN-LABEL: {{^}}ueq:
-; GCN: v_cmp_lg_f32
+; Should use v_cmp_lg_f32
define amdgpu_gs void @ueq(float %a) {
+; SI-LABEL: ueq:
+; SI: ; %bb.0:
+; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v0
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_mov_b32 m0, 0
+; SI-NEXT: s_nop 0
+; SI-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; SI-NEXT: s_endpgm
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: ueq:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_lg_f32_e32 vcc, 0, v0
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_mov_b32 m0, 0
+; GFX10-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: ueq:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cmp_lg_f32_e32 vcc, 0, v0
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_mov_b32 m0, 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: ; %bb.1:
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: ueq:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0
+; GFX12-NEXT: s_mov_b64 s[0:1], exec
+; GFX12-NEXT: s_and_not1_b64 s[2:3], exec, vcc
+; GFX12-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX12-NEXT: s_mov_b32 m0, 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: ; %bb.1:
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: s_endpgm
%c1 = fcmp ueq float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
-; GCN-LABEL: {{^}}ugt:
-; GCN: v_cmp_ge_f32
+; Should use v_cmp_ge_f32
define amdgpu_gs void @ugt(float %a) {
+; SI-LABEL: ugt:
+; SI: ; %bb.0:
+; SI-NEXT: v_cmp_ge_f32_e32 vcc, 0, v0
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_mov_b32 m0, 0
+; SI-NEXT: s_nop 0
+; SI-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; SI-NEXT: s_endpgm
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: ugt:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_ge_f32_e32 vcc, 0, v0
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_mov_b32 m0, 0
+; GFX10-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: ugt:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cmp_ge_f32_e32 vcc, 0, v0
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_mov_b32 m0, 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: ; %bb.1:
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: ugt:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_cmp_nge_f32_e32 vcc, 0, v0
+; GFX12-NEXT: s_mov_b64 s[0:1], exec
+; GFX12-NEXT: s_and_not1_b64 s[2:3], exec, vcc
+; GFX12-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX12-NEXT: s_mov_b32 m0, 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: ; %bb.1:
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: s_endpgm
%c1 = fcmp ugt float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
-; GCN-LABEL: {{^}}uge:
-; GCN: v_cmp_gt_f32_e32 vcc, -1.0
+; Should use v_cmp_gt_f32_e32 vcc, -1.0
define amdgpu_gs void @uge(float %a) {
+; SI-LABEL: uge:
+; SI: ; %bb.0:
+; SI-NEXT: v_cmp_gt_f32_e32 vcc, -1.0, v0
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_mov_b32 m0, 0
+; SI-NEXT: s_nop 0
+; SI-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; SI-NEXT: s_endpgm
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: uge:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc, -1.0, v0
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_mov_b32 m0, 0
+; GFX10-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: uge:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cmp_gt_f32_e32 vcc, -1.0, v0
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_mov_b32 m0, 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: ; %bb.1:
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: uge:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc, -1.0, v0
+; GFX12-NEXT: s_mov_b64 s[0:1], exec
+; GFX12-NEXT: s_and_not1_b64 s[2:3], exec, vcc
+; GFX12-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX12-NEXT: s_mov_b32 m0, 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: ; %bb.1:
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: s_endpgm
%c1 = fcmp uge float %a, -1.0
call void @llvm.amdgcn.kill(i1 %c1)
call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
-; GCN-LABEL: {{^}}ult:
-; GCN: v_cmp_le_f32_e32 vcc, -2.0
+; Should use v_cmp_le_f32_e32 vcc, -2.0
define amdgpu_gs void @ult(float %a) {
+; SI-LABEL: ult:
+; SI: ; %bb.0:
+; SI-NEXT: v_cmp_le_f32_e32 vcc, -2.0, v0
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_mov_b32 m0, 0
+; SI-NEXT: s_nop 0
+; SI-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; SI-NEXT: s_endpgm
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: ult:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_le_f32_e32 vcc, -2.0, v0
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_mov_b32 m0, 0
+; GFX10-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: ult:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cmp_le_f32_e32 vcc, -2.0, v0
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_mov_b32 m0, 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: ; %bb.1:
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: ult:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_cmp_nle_f32_e32 vcc, -2.0, v0
+; GFX12-NEXT: s_mov_b64 s[0:1], exec
+; GFX12-NEXT: s_and_not1_b64 s[2:3], exec, vcc
+; GFX12-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX12-NEXT: s_mov_b32 m0, 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: ; %bb.1:
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: s_endpgm
%c1 = fcmp ult float %a, -2.0
call void @llvm.amdgcn.kill(i1 %c1)
call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
-; GCN-LABEL: {{^}}ule:
-; GCN: v_cmp_lt_f32_e32 vcc, 2.0
+; Should use v_cmp_lt_f32_e32 vcc, 2.0
define amdgpu_gs void @ule(float %a) {
+; SI-LABEL: ule:
+; SI: ; %bb.0:
+; SI-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v0
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_mov_b32 m0, 0
+; SI-NEXT: s_nop 0
+; SI-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; SI-NEXT: s_endpgm
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: ule:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v0
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_mov_b32 m0, 0
+; GFX10-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: ule:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v0
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_mov_b32 m0, 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: ; %bb.1:
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: ule:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc, 2.0, v0
+; GFX12-NEXT: s_mov_b64 s[0:1], exec
+; GFX12-NEXT: s_and_not1_b64 s[2:3], exec, vcc
+; GFX12-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX12-NEXT: s_mov_b32 m0, 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: ; %bb.1:
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: s_endpgm
%c1 = fcmp ule float %a, 2.0
call void @llvm.amdgcn.kill(i1 %c1)
call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
-; GCN-LABEL: {{^}}une:
-; GCN: v_cmp_eq_f32_e32 vcc, 0
+; Should use v_cmp_eq_f32_e32 vcc, 0
define amdgpu_gs void @une(float %a) {
+; SI-LABEL: une:
+; SI: ; %bb.0:
+; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_mov_b32 m0, 0
+; SI-NEXT: s_nop 0
+; SI-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; SI-NEXT: s_endpgm
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: une:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_mov_b32 m0, 0
+; GFX10-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: une:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_mov_b32 m0, 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: ; %bb.1:
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: une:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
+; GFX12-NEXT: s_mov_b64 s[0:1], exec
+; GFX12-NEXT: s_and_not1_b64 s[2:3], exec, vcc
+; GFX12-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX12-NEXT: s_mov_b32 m0, 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: ; %bb.1:
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: s_endpgm
%c1 = fcmp une float %a, 0.0
call void @llvm.amdgcn.kill(i1 %c1)
call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
ret void
}
-; GCN-LABEL: {{^}}neg_olt:
-; GCN: v_cmp_gt_f32_e32 vcc, 1.0
+; Should use v_cmp_gt_f32_e32 vcc, 1.0
define amdgpu_gs void @neg_olt(float %a) {
+; SI-LABEL: neg_olt:
+; SI: ; %bb.0:
+; SI-NEXT: v_cmp_gt_f32_e32 vcc, 1.0, v0
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_mov_b32 m0, 0
+; SI-NEXT: s_nop 0
+; SI-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; SI-NEXT: s_endpgm
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: neg_olt:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_gt_f32_e32 vcc, 1.0, v0
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_mov_b32 m0, 0
+; GFX10-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: neg_olt:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cmp_gt_f32_e32 vcc, 1.0, v0
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_mov_b32 m0, 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: ; %bb.1:
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: neg_olt:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc, 1.0, v0
+; GFX12-NEXT: s_mov_b64 s[0:1], exec
+; GFX12-NEXT: s_and_not1_b64 s[2:3], exec, vcc
+; GFX12-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX12-NEXT: s_mov_b32 m0, 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: ; %bb.1:
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: s_endpgm
%c1 = fcmp olt float %a, 1.0
%c2 = xor i1 %c1, 1
call void @llvm.amdgcn.kill(i1 %c2)
@@ -219,13 +1151,61 @@ define amdgpu_gs void @neg_olt(float %a) {
ret void
}
-; GCN-LABEL: {{^}}fcmp_x2:
; FIXME: LLVM should be able to combine these fcmp opcodes.
-; SI: v_cmp_lt_f32_e32 vcc, s{{[0-9]+}}, v0
-; GFX10: v_cmp_lt_f32_e32 vcc, 0x3e800000, v0
-; GCN: v_cndmask_b32
-; GCN: v_cmp_nle_f32
define amdgpu_ps void @fcmp_x2(float %a) #0 {
+; SI-LABEL: fcmp_x2:
+; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 s0, 0x3e800000
+; SI-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
+; SI-NEXT: v_cmp_nle_f32_e32 vcc, 0, v0
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_cbranch_scc0 .LBB21_1
+; SI-NEXT: s_endpgm
+; SI-NEXT: .LBB21_1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: exp null off, off, off, off done vm
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: fcmp_x2:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_lt_f32_e32 vcc, 0x3e800000, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
+; GFX10-NEXT: v_cmp_nle_f32_e32 vcc, 0, v0
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_cbranch_scc0 .LBB21_1
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: .LBB21_1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: exp null off, off, off, off done vm
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: fcmp_x2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cmp_lt_f32_e32 vcc, 0x3e800000, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
+; GFX11-NEXT: v_cmp_nle_f32_e32 vcc, 0, v0
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_cbranch_scc0 .LBB21_1
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: .LBB21_1:
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: exp mrt0 off, off, off, off done
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: fcmp_x2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_cmp_lt_f32_e32 vcc, 0x3e800000, v0
+; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
+; GFX12-NEXT: v_cmp_le_f32_e32 vcc, 0, v0
+; GFX12-NEXT: s_and_not1_b64 s[2:3], exec, vcc
+; GFX12-NEXT: s_and_not1_b64 s[0:1], exec, s[2:3]
+; GFX12-NEXT: s_cbranch_scc0 .LBB21_1
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: .LBB21_1:
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: export mrt0 off, off, off, off done
+; GFX12-NEXT: s_endpgm
%ogt = fcmp nsz ogt float %a, 2.500000e-01
%k = select i1 %ogt, float -1.000000e+00, float 0.000000e+00
%c = fcmp nsz oge float %k, 0.000000e+00
@@ -234,14 +1214,78 @@ define amdgpu_ps void @fcmp_x2(float %a) #0 {
}
; Note: an almost identical test for this exists in llvm.amdgcn.wqm.vote.ll
-; GCN-LABEL: {{^}}wqm:
-; GCN: v_cmp_neq_f32_e32 vcc, 0
-; GCN-DAG: s_wqm_b64 s[2:3], vcc
-; GCN-DAG: s_mov_b64 s[0:1], exec
-; GCN: s_and{{n2|_not1}}_b64 s[2:3], exec, s[2:3]
-; GCN: s_and{{n2|_not1}}_b64 s[0:1], s[0:1], s[2:3]
-; GCN: s_and_b64 exec, exec, s[0:1]
define amdgpu_ps float @wqm(float %a) {
+; SI-LABEL: wqm:
+; SI: ; %bb.0:
+; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
+; SI-NEXT: s_wqm_b64 s[2:3], vcc
+; SI-NEXT: s_mov_b64 s[0:1], exec
+; SI-NEXT: s_andn2_b64 s[2:3], exec, s[2:3]
+; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT: s_cbranch_scc0 .LBB22_2
+; SI-NEXT: ; %bb.1:
+; SI-NEXT: s_and_b64 exec, exec, s[0:1]
+; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: s_branch .LBB22_3
+; SI-NEXT: .LBB22_2:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: exp null off, off, off, off done vm
+; SI-NEXT: s_endpgm
+; SI-NEXT: .LBB22_3:
+;
+; GFX10-LABEL: wqm:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
+; GFX10-NEXT: s_mov_b64 s[0:1], exec
+; GFX10-NEXT: s_wqm_b64 s[2:3], vcc
+; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[2:3]
+; GFX10-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT: s_cbranch_scc0 .LBB22_2
+; GFX10-NEXT: ; %bb.1:
+; GFX10-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_branch .LBB22_3
+; GFX10-NEXT: .LBB22_2:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: exp null off, off, off, off done vm
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: .LBB22_3:
+;
+; GFX11-LABEL: wqm:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
+; GFX11-NEXT: s_mov_b64 s[0:1], exec
+; GFX11-NEXT: s_wqm_b64 s[2:3], vcc
+; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[2:3]
+; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX11-NEXT: s_cbranch_scc0 .LBB22_2
+; GFX11-NEXT: ; %bb.1:
+; GFX11-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_branch .LBB22_3
+; GFX11-NEXT: .LBB22_2:
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: exp mrt0 off, off, off, off done
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: .LBB22_3:
+;
+; GFX12-LABEL: wqm:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
+; GFX12-NEXT: s_mov_b64 s[0:1], exec
+; GFX12-NEXT: s_wqm_b64 s[2:3], vcc
+; GFX12-NEXT: s_and_not1_b64 s[2:3], exec, s[2:3]
+; GFX12-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_cbranch_scc0 .LBB22_2
+; GFX12-NEXT: ; %bb.1:
+; GFX12-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_branch .LBB22_3
+; GFX12-NEXT: .LBB22_2:
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: export mrt0 off, off, off, off done
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: .LBB22_3:
%c1 = fcmp une float %a, 0.0
%c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1)
call void @llvm.amdgcn.kill(i1 %c2)
@@ -249,28 +1293,212 @@ define amdgpu_ps float @wqm(float %a) {
}
; This checks that we use the 64-bit encoding when the operand is a SGPR.
-; GCN-LABEL: {{^}}test_sgpr:
-; GCN: v_cmp_nle_f32_e64
define amdgpu_ps void @test_sgpr(float inreg %a) #0 {
+; SI-LABEL: test_sgpr:
+; SI: ; %bb.0:
+; SI-NEXT: v_cmp_nle_f32_e64 vcc, s0, 1.0
+; SI-NEXT: s_andn2_b64 exec, exec, vcc
+; SI-NEXT: s_cbranch_scc0 .LBB23_1
+; SI-NEXT: s_endpgm
+; SI-NEXT: .LBB23_1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: exp null off, off, off, off done vm
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: test_sgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_nle_f32_e64 vcc, s0, 1.0
+; GFX10-NEXT: s_andn2_b64 exec, exec, vcc
+; GFX10-NEXT: s_cbranch_scc0 .LBB23_1
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: .LBB23_1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: exp null off, off, off, off done vm
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_sgpr:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cmp_nle_f32_e64 vcc, s0, 1.0
+; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
+; GFX11-NEXT: s_cbranch_scc0 .LBB23_1
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: .LBB23_1:
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: exp mrt0 off, off, off, off done
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_sgpr:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_cmp_le_f32 s0, 1.0
+; GFX12-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX12-NEXT: s_and_not1_b64 s[0:1], exec, s[0:1]
+; GFX12-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1]
+; GFX12-NEXT: s_cbranch_scc0 .LBB23_1
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: .LBB23_1:
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: export mrt0 off, off, off, off done
+; GFX12-NEXT: s_endpgm
%c = fcmp ole float %a, 1.000000e+00
call void @llvm.amdgcn.kill(i1 %c) #1
ret void
}
-; GCN-LABEL: {{^}}test_non_inline_imm_sgpr:
-; GCN-NOT: v_cmp_le_f32_e64
define amdgpu_ps void @test_non_inline_imm_sgpr(float inreg %a) #0 {
+; SI-LABEL: test_non_inline_imm_sgpr:
+; SI: ; %bb.0:
+; SI-NEXT: v_mov_b32_e32 v0, 0x3fc00000
+; SI-NEXT: v_cmp_le_f32_e32 vcc, s0, v0
+; SI-NEXT: s_andn2_b64 s[0:1], exec, vcc
+; SI-NEXT: s_andn2_b64 s[2:3], exec, s[0:1]
+; SI-NEXT: s_cbranch_scc0 .LBB24_1
+; SI-NEXT: s_endpgm
+; SI-NEXT: .LBB24_1:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: exp null off, off, off, off done vm
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: test_non_inline_imm_sgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_ge_f32_e64 s[0:1], 0x3fc00000, s0
+; GFX10-NEXT: s_andn2_b64 s[0:1], exec, s[0:1]
+; GFX10-NEXT: s_andn2_b64 s[2:3], exec, s[0:1]
+; GFX10-NEXT: s_cbranch_scc0 .LBB24_1
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: .LBB24_1:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: exp null off, off, off, off done vm
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_non_inline_imm_sgpr:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cmp_ge_f32_e64 s[0:1], 0x3fc00000, s0
+; GFX11-NEXT: s_and_not1_b64 s[0:1], exec, s[0:1]
+; GFX11-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1]
+; GFX11-NEXT: s_cbranch_scc0 .LBB24_1
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: .LBB24_1:
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: exp mrt0 off, off, off, off done
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_non_inline_imm_sgpr:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_cmp_le_f32 s0, 0x3fc00000
+; GFX12-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX12-NEXT: s_and_not1_b64 s[0:1], exec, s[0:1]
+; GFX12-NEXT: s_and_not1_b64 s[2:3], exec, s[0:1]
+; GFX12-NEXT: s_cbranch_scc0 .LBB24_1
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: .LBB24_1:
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: export mrt0 off, off, off, off done
+; GFX12-NEXT: s_endpgm
%c = fcmp ole float %a, 1.500000e+00
call void @llvm.amdgcn.kill(i1 %c) #1
ret void
}
-; GCN-LABEL: {{^}}test_scc_liveness:
-; GCN: s_cmp
-; GCN: s_and_b64 exec
-; GCN: s_cmp
-; GCN: s_cbranch_scc
define amdgpu_ps void @test_scc_liveness() #0 {
+; SI-LABEL: test_scc_liveness:
+; SI: ; %bb.0: ; %main_body
+; SI-NEXT: s_mov_b64 s[0:1], exec
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: .LBB25_1: ; %loop3
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_cmp_gt_i32 s2, 0
+; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT: s_andn2_b64 s[4:5], exec, s[4:5]
+; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; SI-NEXT: s_cbranch_scc0 .LBB25_4
+; SI-NEXT: ; %bb.2: ; %loop3
+; SI-NEXT: ; in Loop: Header=BB25_1 Depth=1
+; SI-NEXT: s_and_b64 exec, exec, s[0:1]
+; SI-NEXT: s_add_i32 s3, s2, 1
+; SI-NEXT: s_cmp_lt_i32 s2, 1
+; SI-NEXT: s_mov_b32 s2, s3
+; SI-NEXT: s_cbranch_scc1 .LBB25_1
+; SI-NEXT: ; %bb.3: ; %endloop15
+; SI-NEXT: s_endpgm
+; SI-NEXT: .LBB25_4:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: exp null off, off, off, off done vm
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: test_scc_liveness:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_mov_b64 s[0:1], exec
+; GFX10-NEXT: s_mov_b32 s2, 0
+; GFX10-NEXT: .LBB25_1: ; %loop3
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_cmp_gt_i32 s2, 0
+; GFX10-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX10-NEXT: s_andn2_b64 s[4:5], exec, s[4:5]
+; GFX10-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX10-NEXT: s_cbranch_scc0 .LBB25_4
+; GFX10-NEXT: ; %bb.2: ; %loop3
+; GFX10-NEXT: ; in Loop: Header=BB25_1 Depth=1
+; GFX10-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX10-NEXT: s_add_i32 s3, s2, 1
+; GFX10-NEXT: s_cmp_lt_i32 s2, 1
+; GFX10-NEXT: s_mov_b32 s2, s3
+; GFX10-NEXT: s_cbranch_scc1 .LBB25_1
+; GFX10-NEXT: ; %bb.3: ; %endloop15
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: .LBB25_4:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: exp null off, off, off, off done vm
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_scc_liveness:
+; GFX11: ; %bb.0: ; %main_body
+; GFX11-NEXT: s_mov_b64 s[0:1], exec
+; GFX11-NEXT: s_mov_b32 s2, 0
+; GFX11-NEXT: .LBB25_1: ; %loop3
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_cmp_gt_i32 s2, 0
+; GFX11-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX11-NEXT: s_and_not1_b64 s[4:5], exec, s[4:5]
+; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
+; GFX11-NEXT: s_cbranch_scc0 .LBB25_4
+; GFX11-NEXT: ; %bb.2: ; %loop3
+; GFX11-NEXT: ; in Loop: Header=BB25_1 Depth=1
+; GFX11-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX11-NEXT: s_add_i32 s3, s2, 1
+; GFX11-NEXT: s_cmp_lt_i32 s2, 1
+; GFX11-NEXT: s_mov_b32 s2, s3
+; GFX11-NEXT: s_cbranch_scc1 .LBB25_1
+; GFX11-NEXT: ; %bb.3: ; %endloop15
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: .LBB25_4:
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: exp mrt0 off, off, off, off done
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_scc_liveness:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: s_mov_b64 s[0:1], exec
+; GFX12-NEXT: s_mov_b32 s2, 0
+; GFX12-NEXT: .LBB25_1: ; %loop3
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_cmp_gt_i32 s2, 0
+; GFX12-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX12-NEXT: s_and_not1_b64 s[4:5], exec, s[4:5]
+; GFX12-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_cbranch_scc0 .LBB25_4
+; GFX12-NEXT: ; %bb.2: ; %loop3
+; GFX12-NEXT: ; in Loop: Header=BB25_1 Depth=1
+; GFX12-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX12-NEXT: s_add_co_i32 s3, s2, 1
+; GFX12-NEXT: s_cmp_lt_i32 s2, 1
+; GFX12-NEXT: s_mov_b32 s2, s3
+; GFX12-NEXT: s_cbranch_scc1 .LBB25_1
+; GFX12-NEXT: ; %bb.3: ; %endloop15
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: .LBB25_4:
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: export mrt0 off, off, off, off done
+; GFX12-NEXT: s_endpgm
main_body:
br label %loop3
@@ -287,11 +1515,139 @@ endloop15: ; preds = %loop3
; Check that this compiles.
; If kill were marked as defining VCC, this would fail with live interval issues.
-; GCN-LABEL: {{^}}kill_with_loop_exit:
-; GCN: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
-; GCN: s_and{{n2|_not1}}_b64 [[LIVE]], [[LIVE]], exec
-; GCN-NEXT: s_cbranch_scc0
define amdgpu_ps void @kill_with_loop_exit(float inreg %inp0, float inreg %inp1, <4 x i32> inreg %inp2, float inreg %inp3) {
+; SI-LABEL: kill_with_loop_exit:
+; SI: ; %bb.0: ; %.entry
+; SI-NEXT: v_mov_b32_e32 v0, 0x43000000
+; SI-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
+; SI-NEXT: v_cmp_lt_f32_e64 s[0:1], s1, v0
+; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT: s_and_b64 vcc, exec, s[0:1]
+; SI-NEXT: v_mov_b32_e32 v0, 1.0
+; SI-NEXT: s_cbranch_vccnz .LBB26_5
+; SI-NEXT: ; %bb.1: ; %.preheader1.preheader
+; SI-NEXT: v_cmp_ngt_f32_e64 s[0:1], s6, 0
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; SI-NEXT: s_mov_b64 s[2:3], exec
+; SI-NEXT: v_mov_b32_e32 v0, 0x3fc00000
+; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1
+; SI-NEXT: .LBB26_2: ; %bb
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_and_b64 vcc, exec, s[0:1]
+; SI-NEXT: v_add_f32_e32 v0, 0x3e800000, v0
+; SI-NEXT: s_cbranch_vccnz .LBB26_2
+; SI-NEXT: ; %bb.3: ; %bb33
+; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
+; SI-NEXT: s_cbranch_scc0 .LBB26_6
+; SI-NEXT: ; %bb.4: ; %bb33
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: .LBB26_5: ; %bb35
+; SI-NEXT: exp mrt0 v0, v0, v0, v0 done vm
+; SI-NEXT: s_endpgm
+; SI-NEXT: .LBB26_6:
+; SI-NEXT: s_mov_b64 exec, 0
+; SI-NEXT: exp null off, off, off, off done vm
+; SI-NEXT: s_endpgm
+;
+; GFX10-LABEL: kill_with_loop_exit:
+; GFX10: ; %bb.0: ; %.entry
+; GFX10-NEXT: v_cmp_gt_f32_e64 s[4:5], 0x43000000, s0
+; GFX10-NEXT: v_cmp_gt_f32_e64 s[0:1], 0x43000000, s1
+; GFX10-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX10-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1]
+; GFX10-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GFX10-NEXT: s_cbranch_vccnz .LBB26_5
+; GFX10-NEXT: ; %bb.1: ; %.preheader1.preheader
+; GFX10-NEXT: v_cmp_ngt_f32_e64 s[0:1], s6, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x3fc00000
+; GFX10-NEXT: s_mov_b64 s[2:3], exec
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX10-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1
+; GFX10-NEXT: .LBB26_2: ; %bb
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_add_f32_e32 v0, 0x3e800000, v0
+; GFX10-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GFX10-NEXT: s_cbranch_vccnz .LBB26_2
+; GFX10-NEXT: ; %bb.3: ; %bb33
+; GFX10-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
+; GFX10-NEXT: s_cbranch_scc0 .LBB26_6
+; GFX10-NEXT: ; %bb.4: ; %bb33
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: .LBB26_5: ; %bb35
+; GFX10-NEXT: exp mrt0 v0, v0, v0, v0 done vm
+; GFX10-NEXT: s_endpgm
+; GFX10-NEXT: .LBB26_6:
+; GFX10-NEXT: s_mov_b64 exec, 0
+; GFX10-NEXT: exp null off, off, off, off done vm
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: kill_with_loop_exit:
+; GFX11: ; %bb.0: ; %.entry
+; GFX11-NEXT: v_cmp_gt_f32_e64 s[4:5], 0x43000000, s0
+; GFX11-NEXT: v_cmp_gt_f32_e64 s[0:1], 0x43000000, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX11-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1]
+; GFX11-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GFX11-NEXT: s_cbranch_vccnz .LBB26_5
+; GFX11-NEXT: ; %bb.1: ; %.preheader1.preheader
+; GFX11-NEXT: v_cmp_ngt_f32_e64 s[0:1], s6, 0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0x3fc00000
+; GFX11-NEXT: s_mov_b64 s[2:3], exec
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX11-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1
+; GFX11-NEXT: .LBB26_2: ; %bb
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_add_f32_e32 v0, 0x3e800000, v0
+; GFX11-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GFX11-NEXT: s_cbranch_vccnz .LBB26_2
+; GFX11-NEXT: ; %bb.3: ; %bb33
+; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], exec
+; GFX11-NEXT: s_cbranch_scc0 .LBB26_6
+; GFX11-NEXT: ; %bb.4: ; %bb33
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: .LBB26_5: ; %bb35
+; GFX11-NEXT: exp mrt0 v0, v0, v0, v0 done
+; GFX11-NEXT: s_endpgm
+; GFX11-NEXT: .LBB26_6:
+; GFX11-NEXT: s_mov_b64 exec, 0
+; GFX11-NEXT: exp mrt0 off, off, off, off done
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: kill_with_loop_exit:
+; GFX12: ; %bb.0: ; %.entry
+; GFX12-NEXT: s_cmp_lt_f32 s0, 0x43000000
+; GFX12-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX12-NEXT: s_cmp_lt_f32 s1, 0x43000000
+; GFX12-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX12-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT: s_mov_b32 s4, 1.0
+; GFX12-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GFX12-NEXT: s_cbranch_vccnz .LBB26_5
+; GFX12-NEXT: ; %bb.1: ; %.preheader1.preheader
+; GFX12-NEXT: s_cmp_ngt_f32 s6, 0
+; GFX12-NEXT: s_mov_b64 s[2:3], exec
+; GFX12-NEXT: s_mov_b32 s4, 0x3fc00000
+; GFX12-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX12-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0
+; GFX12-NEXT: .LBB26_2: ; %bb
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_add_f32 s4, s4, 0x3e800000
+; GFX12-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GFX12-NEXT: s_cbranch_vccnz .LBB26_2
+; GFX12-NEXT: ; %bb.3: ; %bb33
+; GFX12-NEXT: s_and_not1_b64 s[2:3], s[2:3], exec
+; GFX12-NEXT: s_cbranch_scc0 .LBB26_6
+; GFX12-NEXT: ; %bb.4: ; %bb33
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: .LBB26_5: ; %bb35
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: export mrt0 v0, v0, v0, v0 done
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: .LBB26_6:
+; GFX12-NEXT: s_mov_b64 exec, 0
+; GFX12-NEXT: export mrt0 off, off, off, off done
+; GFX12-NEXT: s_endpgm
.entry:
%tmp24 = fcmp olt float %inp0, 1.280000e+02
%tmp25 = fcmp olt float %inp1, 1.280000e+02
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll
index 017d402..3377290e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll
@@ -104,7 +104,7 @@ define amdgpu_ps void @flat_load_monitor_b32(ptr %addr, ptr addrspace(1) %use) {
; GFX1250-NEXT: global_store_b32 v[2:3], v0, off
; GFX1250-NEXT: s_endpgm
entry:
- %gep = getelementptr i64, ptr addrspace(0) %addr, i32 4
+ %gep = getelementptr inbounds i64, ptr addrspace(0) %addr, i32 4
%val = call i32 @llvm.amdgcn.flat.load.monitor.b32.i32(ptr addrspace(0) %gep, i32 10)
store i32 %val, ptr addrspace(1) %use
ret void
@@ -118,7 +118,7 @@ define amdgpu_ps void @flat_load_monitor_b64(ptr %addr, ptr addrspace(1) %use) {
; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off
; GFX1250-NEXT: s_endpgm
entry:
- %gep = getelementptr i64, ptr addrspace(0) %addr, i32 4
+ %gep = getelementptr inbounds i64, ptr addrspace(0) %addr, i32 4
%val = call <2 x i32> @llvm.amdgcn.flat.load.monitor.b64.v2i32(ptr addrspace(0) %gep, i32 22)
store <2 x i32> %val, ptr addrspace(1) %use
ret void
@@ -132,7 +132,7 @@ define amdgpu_ps void @flat_load_monitor_b128(ptr %addr, ptr addrspace(1) %use)
; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX1250-NEXT: s_endpgm
entry:
- %gep = getelementptr i64, ptr addrspace(0) %addr, i32 4
+ %gep = getelementptr inbounds i64, ptr addrspace(0) %addr, i32 4
%val = call <4 x i32> @llvm.amdgcn.flat.load.monitor.b128.v4i32(ptr addrspace(0) %gep, i32 27)
store <4 x i32> %val, ptr addrspace(1) %use
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index 8081a15..284ced1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -39,16 +39,16 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
; GCN-NEXT: v_accvgpr_write_b32 a13, s21
; GCN-NEXT: v_accvgpr_write_b32 a14, s22
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
-; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
; GCN-NEXT: v_mov_b32_e32 v16, s16
+; GCN-NEXT: v_mov_b32_e32 v17, s17
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15]
+; GCN-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NEXT: v_mov_b32_e32 v19, s19
; GCN-NEXT: v_mov_b32_e32 v0, s20
; GCN-NEXT: v_mov_b32_e32 v1, s21
; GCN-NEXT: v_mov_b32_e32 v2, s22
; GCN-NEXT: v_mov_b32_e32 v3, s23
-; GCN-NEXT: v_mov_b32_e32 v17, s17
-; GCN-NEXT: v_mov_b32_e32 v18, s18
-; GCN-NEXT: v_mov_b32_e32 v19, s19
+; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
; GCN-NEXT: s_nop 4
; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
@@ -112,16 +112,16 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
; GCN-NEXT: v_accvgpr_write_b32 a13, s21
; GCN-NEXT: v_accvgpr_write_b32 a14, s22
; GCN-NEXT: v_accvgpr_write_b32 a15, s23
-; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
; GCN-NEXT: v_mov_b32_e32 v16, s16
+; GCN-NEXT: v_mov_b32_e32 v17, s17
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; GCN-NEXT: v_mov_b32_e32 v18, s18
+; GCN-NEXT: v_mov_b32_e32 v19, s19
; GCN-NEXT: v_mov_b32_e32 v0, s20
; GCN-NEXT: v_mov_b32_e32 v1, s21
; GCN-NEXT: v_mov_b32_e32 v2, s22
; GCN-NEXT: v_mov_b32_e32 v3, s23
-; GCN-NEXT: v_mov_b32_e32 v17, s17
-; GCN-NEXT: v_mov_b32_e32 v18, s18
-; GCN-NEXT: v_mov_b32_e32 v19, s19
+; GCN-NEXT: v_mov_b64_e32 v[14:15], 0
; GCN-NEXT: s_nop 4
; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index d81ec1c..078a043 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -406,16 +406,16 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
; SDAG-NEXT: v_mov_b32_e32 v0, s20
; SDAG-NEXT: v_mov_b32_e32 v1, s21
; SDAG-NEXT: v_mov_b32_e32 v2, s22
; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b32_e32 v17, s17
-; SDAG-NEXT: v_mov_b32_e32 v18, s18
-; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
; SDAG-NEXT: s_nop 4
; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -449,9 +449,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], 0
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], 48
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], 16
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -474,31 +474,34 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48
; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], 32
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: s_nop 4
-; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[24:25], a[24:27], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[26:27], a[28:31], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
+; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -530,16 +533,16 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
; HEURRC-NEXT: v_mov_b32_e32 v0, s20
; HEURRC-NEXT: v_mov_b32_e32 v1, s21
; HEURRC-NEXT: v_mov_b32_e32 v2, s22
; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: v_mov_b32_e32 v17, s17
-; HEURRC-NEXT: v_mov_b32_e32 v18, s18
-; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
; HEURRC-NEXT: s_nop 4
; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
@@ -589,12 +592,12 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
; VGPRRC-NEXT: v_mov_b32_e32 v48, s16
-; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
; VGPRRC-NEXT: v_mov_b32_e32 v49, s17
+; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
; VGPRRC-NEXT: v_mov_b32_e32 v50, s18
; VGPRRC-NEXT: v_mov_b32_e32 v51, s19
+; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
@@ -605,12 +608,12 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
-; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
@@ -788,16 +791,16 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
; SDAG-NEXT: v_mov_b32_e32 v0, s20
; SDAG-NEXT: v_mov_b32_e32 v1, s21
; SDAG-NEXT: v_mov_b32_e32 v2, s22
; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b32_e32 v17, s17
-; SDAG-NEXT: v_mov_b32_e32 v18, s18
-; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
; SDAG-NEXT: s_nop 4
; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -831,9 +834,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], 0
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], 48
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], 16
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -856,31 +859,34 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48
; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], 32
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: s_nop 4
-; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[24:25], a[24:27], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[26:27], a[28:31], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
+; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -912,16 +918,16 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
; HEURRC-NEXT: v_mov_b32_e32 v0, s20
; HEURRC-NEXT: v_mov_b32_e32 v1, s21
; HEURRC-NEXT: v_mov_b32_e32 v2, s22
; HEURRC-NEXT: v_mov_b32_e32 v3, s23
-; HEURRC-NEXT: v_mov_b32_e32 v17, s17
-; HEURRC-NEXT: v_mov_b32_e32 v18, s18
-; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
; HEURRC-NEXT: s_nop 4
; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
@@ -971,12 +977,12 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
; VGPRRC-NEXT: v_mov_b32_e32 v48, s16
-; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
; VGPRRC-NEXT: v_mov_b32_e32 v49, s17
+; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
; VGPRRC-NEXT: v_mov_b32_e32 v50, s18
; VGPRRC-NEXT: v_mov_b32_e32 v51, s19
+; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
@@ -987,12 +993,12 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
-; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
@@ -2978,47 +2984,47 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s24
-; SDAG-NEXT: v_mov_b32_e32 v9, s25
-; SDAG-NEXT: v_mov_b32_e32 v10, s26
-; SDAG-NEXT: v_mov_b32_e32 v11, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
-; SDAG-NEXT: v_mov_b32_e32 v13, s29
-; SDAG-NEXT: v_mov_b32_e32 v14, s30
-; SDAG-NEXT: v_mov_b32_e32 v15, s31
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b32_e32 v4, s24
+; SDAG-NEXT: v_mov_b32_e32 v5, s25
+; SDAG-NEXT: v_mov_b32_e32 v6, s26
+; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
+; SDAG-NEXT: v_mov_b32_e32 v8, s28
+; SDAG-NEXT: v_mov_b32_e32 v9, s29
+; SDAG-NEXT: v_mov_b32_e32 v10, s30
+; SDAG-NEXT: v_mov_b32_e32 v11, s31
+; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
+; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
+; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
+; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
+; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
+; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
+; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
+; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
+; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
+; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
+; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
+; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
+; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
+; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[4:7], v[8:11], a[16:31]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15]
; SDAG-NEXT: v_mov_b32_e32 v8, s16
; SDAG-NEXT: v_mov_b32_e32 v9, s17
; SDAG-NEXT: v_mov_b32_e32 v10, s18
; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
+; SDAG-NEXT: s_nop 5
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3047,9 +3053,9 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], 0
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], 48
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], 16
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -3072,31 +3078,34 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48
; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], 32
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: s_nop 4
-; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[24:25], a[24:27], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[26:27], a[28:31], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
+; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -3106,47 +3115,47 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v8, s24
-; HEURRC-NEXT: v_mov_b32_e32 v9, s25
-; HEURRC-NEXT: v_mov_b32_e32 v10, s26
-; HEURRC-NEXT: v_mov_b32_e32 v11, s27
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v12, s28
-; HEURRC-NEXT: v_mov_b32_e32 v13, s29
-; HEURRC-NEXT: v_mov_b32_e32 v14, s30
-; HEURRC-NEXT: v_mov_b32_e32 v15, s31
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b32_e32 v4, s24
+; HEURRC-NEXT: v_mov_b32_e32 v5, s25
+; HEURRC-NEXT: v_mov_b32_e32 v6, s26
+; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
+; HEURRC-NEXT: v_mov_b32_e32 v8, s28
+; HEURRC-NEXT: v_mov_b32_e32 v9, s29
+; HEURRC-NEXT: v_mov_b32_e32 v10, s30
+; HEURRC-NEXT: v_mov_b32_e32 v11, s31
+; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
+; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
+; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
+; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
+; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
+; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
+; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
+; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
+; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
+; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
+; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
+; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
+; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
+; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
+; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[4:7], v[8:11], a[16:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0
-; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15]
; HEURRC-NEXT: v_mov_b32_e32 v8, s16
; HEURRC-NEXT: v_mov_b32_e32 v9, s17
; HEURRC-NEXT: v_mov_b32_e32 v10, s18
; HEURRC-NEXT: v_mov_b32_e32 v11, s19
-; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
+; HEURRC-NEXT: s_nop 5
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
@@ -3177,37 +3186,40 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48
; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v40, s24
-; VGPRRC-NEXT: v_mov_b32_e32 v41, s25
-; VGPRRC-NEXT: v_mov_b32_e32 v42, s26
-; VGPRRC-NEXT: v_mov_b32_e32 v43, s27
-; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v44, s28
-; VGPRRC-NEXT: v_mov_b32_e32 v45, s29
-; VGPRRC-NEXT: v_mov_b32_e32 v46, s30
-; VGPRRC-NEXT: v_mov_b32_e32 v47, s31
-; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0
-; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[40:43], v[44:47], v[0:15]
+; VGPRRC-NEXT: v_mov_b32_e32 v36, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v37, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v38, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v39, s27
+; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s28
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s29
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s30
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s31
+; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; VGPRRC-NEXT: s_nop 1
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31]
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 3
-; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], 16
+; VGPRRC-NEXT: global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], 0
+; VGPRRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s16
; VGPRRC-NEXT: v_mov_b32_e32 v1, s17
; VGPRRC-NEXT: v_mov_b32_e32 v2, s18
@@ -3226,14 +3238,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8:
@@ -3386,47 +3398,47 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s24
-; SDAG-NEXT: v_mov_b32_e32 v9, s25
-; SDAG-NEXT: v_mov_b32_e32 v10, s26
-; SDAG-NEXT: v_mov_b32_e32 v11, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v12, s28
-; SDAG-NEXT: v_mov_b32_e32 v13, s29
-; SDAG-NEXT: v_mov_b32_e32 v14, s30
-; SDAG-NEXT: v_mov_b32_e32 v15, s31
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b32_e32 v4, s24
+; SDAG-NEXT: v_mov_b32_e32 v5, s25
+; SDAG-NEXT: v_mov_b32_e32 v6, s26
+; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
+; SDAG-NEXT: v_mov_b32_e32 v8, s28
+; SDAG-NEXT: v_mov_b32_e32 v9, s29
+; SDAG-NEXT: v_mov_b32_e32 v10, s30
+; SDAG-NEXT: v_mov_b32_e32 v11, s31
+; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
+; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
+; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
+; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
+; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
+; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
+; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
+; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
+; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
+; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
+; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
+; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
+; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
+; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
+; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[4:7], v[8:11], a[16:31] cbsz:2 abid:3 blgp:1
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
; SDAG-NEXT: v_mov_b32_e32 v8, s16
; SDAG-NEXT: v_mov_b32_e32 v9, s17
; SDAG-NEXT: v_mov_b32_e32 v10, s18
; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
+; SDAG-NEXT: s_nop 5
+; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3455,9 +3467,9 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], 0
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], 48
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], 16
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
@@ -3480,31 +3492,34 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48
; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], 32
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: s_nop 4
-; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[24:25], a[24:27], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[26:27], a[28:31], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
+; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -3514,47 +3529,47 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], 48
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], 32
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v8, s24
-; HEURRC-NEXT: v_mov_b32_e32 v9, s25
-; HEURRC-NEXT: v_mov_b32_e32 v10, s26
-; HEURRC-NEXT: v_mov_b32_e32 v11, s27
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v12, s28
-; HEURRC-NEXT: v_mov_b32_e32 v13, s29
-; HEURRC-NEXT: v_mov_b32_e32 v14, s30
-; HEURRC-NEXT: v_mov_b32_e32 v15, s31
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b32_e32 v4, s24
+; HEURRC-NEXT: v_mov_b32_e32 v5, s25
+; HEURRC-NEXT: v_mov_b32_e32 v6, s26
+; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
+; HEURRC-NEXT: v_mov_b32_e32 v8, s28
+; HEURRC-NEXT: v_mov_b32_e32 v9, s29
+; HEURRC-NEXT: v_mov_b32_e32 v10, s30
+; HEURRC-NEXT: v_mov_b32_e32 v11, s31
+; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
+; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
+; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
+; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
+; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
+; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
+; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
+; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
+; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
+; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
+; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
+; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
+; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
+; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
+; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[4:7], v[8:11], a[16:31] cbsz:2 abid:3 blgp:1
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], 16
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], 0
-; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
; HEURRC-NEXT: v_mov_b32_e32 v8, s16
; HEURRC-NEXT: v_mov_b32_e32 v9, s17
; HEURRC-NEXT: v_mov_b32_e32 v10, s18
; HEURRC-NEXT: v_mov_b32_e32 v11, s19
-; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
+; HEURRC-NEXT: s_nop 5
+; HEURRC-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
@@ -3585,37 +3600,40 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], 48
; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], 32
-; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], 16
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v40, s24
-; VGPRRC-NEXT: v_mov_b32_e32 v41, s25
-; VGPRRC-NEXT: v_mov_b32_e32 v42, s26
-; VGPRRC-NEXT: v_mov_b32_e32 v43, s27
-; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v44, s28
-; VGPRRC-NEXT: v_mov_b32_e32 v45, s29
-; VGPRRC-NEXT: v_mov_b32_e32 v46, s30
-; VGPRRC-NEXT: v_mov_b32_e32 v47, s31
-; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
-; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
-; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], 0
-; VGPRRC-NEXT: s_nop 0
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[40:43], v[44:47], v[0:15] cbsz:2 abid:3 blgp:1
+; VGPRRC-NEXT: v_mov_b32_e32 v36, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v37, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v38, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v39, s27
+; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s28
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s29
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s30
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s31
+; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; VGPRRC-NEXT: s_nop 1
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31] cbsz:2 abid:3 blgp:1
; VGPRRC-NEXT: s_nop 7
; VGPRRC-NEXT: s_nop 3
-; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], 16
+; VGPRRC-NEXT: global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], 0
+; VGPRRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s16
; VGPRRC-NEXT: v_mov_b32_e32 v1, s17
; VGPRRC-NEXT: v_mov_b32_e32 v2, s18
@@ -3634,14 +3652,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT: global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT: global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__flags:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index 0b2818f..24af3fa 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -4784,8 +4784,8 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0
; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
@@ -4811,16 +4811,16 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b32_e32 v16, s1
+; GISEL-NEXT: v_mov_b32_e32 v20, s1
; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v20 op_sel_hi:[0,0,0]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll
index 6a5dc8f..2daf9c3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll
@@ -1,6 +1,6 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s
declare i32 @llvm.amdgcn.prng.b32(i32) #0
@@ -29,6 +29,13 @@ define amdgpu_kernel void @prng_b32_constant_100(ptr addrspace(1) %out) #1 {
ret void
}
+; GCN-LABEL: {{^}}prng_undef_i32:
+; SDAG-NOT: v_prng_b32
+define amdgpu_kernel void @prng_undef_i32(ptr addrspace(1) %out) #1 {
+ %prng = call i32 @llvm.amdgcn.prng.b32(i32 undef)
+ store i32 %prng, ptr addrspace(1) %out, align 4
+ ret void
+}
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
index 7a20b5c..a2c1545 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
@@ -1,27 +1,52 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=CHECK,CHECK-SDAG-TRUE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB0_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB0_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_atomic_buffer_load_i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB0_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB0_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_atomic_buffer_load_i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB0_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB0_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -34,23 +59,42 @@ bb2:
}
define amdgpu_kernel void @raw_atomic_buffer_load_i32_off(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_i32_off:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB1_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB1_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_atomic_buffer_load_i32_off:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB1_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB1_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_atomic_buffer_load_i32_off:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB1_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB1_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -62,23 +106,43 @@ bb2:
ret void
}
define amdgpu_kernel void @raw_atomic_buffer_load_i32_soff(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_i32_soff:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB2_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB2_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_atomic_buffer_load_i32_soff:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB2_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB2_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_atomic_buffer_load_i32_soff:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_mov_b32 s5, 4
+; GFX12-NEXT: .LBB2_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], s5 offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB2_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -90,23 +154,42 @@ bb2:
ret void
}
define amdgpu_kernel void @raw_atomic_buffer_load_i32_dlc(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_i32_dlc:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB3_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB3_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_atomic_buffer_load_i32_dlc:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB3_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB3_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_atomic_buffer_load_i32_dlc:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB3_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null offset:4 th:TH_LOAD_NT_RT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB3_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -119,24 +202,44 @@ bb2:
}
define amdgpu_kernel void @raw_nonatomic_buffer_load_i32(<4 x i32> %addr) {
-; CHECK-LABEL: raw_nonatomic_buffer_load_i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: .LBB4_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_and_b32 s1, exec_lo, vcc_lo
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; CHECK-NEXT: s_or_b32 s0, s1, s0
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; CHECK-NEXT: s_cbranch_execnz .LBB4_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_nonatomic_buffer_load_i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: .LBB4_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_and_b32 s1, exec_lo, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB4_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_nonatomic_buffer_load_i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: .LBB4_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_and_b32 s1, exec_lo, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b32 s0, s1, s0
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB4_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -149,23 +252,43 @@ bb2:
}
define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_i64:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB5_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB5_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_atomic_buffer_load_i64:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB5_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB5_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_atomic_buffer_load_i64:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB5_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB5_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.zext = zext i32 %id to i64
@@ -179,23 +302,42 @@ bb2:
}
define amdgpu_kernel void @raw_atomic_buffer_load_v2i16(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_v2i16:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB6_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB6_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_atomic_buffer_load_v2i16:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB6_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB6_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_atomic_buffer_load_v2i16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB6_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB6_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -209,68 +351,151 @@ bb2:
}
define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
-; CHECK-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_v4i16:
-; CHECK-SDAG-TRUE16: ; %bb.0: ; %bb
-; CHECK-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
-; CHECK-SDAG-TRUE16-NEXT: .LBB7_1: ; %bb1
-; CHECK-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; CHECK-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; CHECK-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; CHECK-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
-; CHECK-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
-; CHECK-SDAG-TRUE16-NEXT: s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_v4i16:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT: .LBB7_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
;
-; CHECK-FAKE16-LABEL: raw_atomic_buffer_load_v4i16:
-; CHECK-FAKE16: ; %bb.0: ; %bb
-; CHECK-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-FAKE16-NEXT: s_mov_b32 s4, 0
-; CHECK-FAKE16-NEXT: .LBB7_1: ; %bb1
-; CHECK-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-FAKE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; CHECK-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; CHECK-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; CHECK-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-FAKE16-NEXT: s_cbranch_execnz .LBB7_1
-; CHECK-FAKE16-NEXT: ; %bb.2: ; %bb2
-; CHECK-FAKE16-NEXT: s_endpgm
+; GFX11-FAKE16-LABEL: raw_atomic_buffer_load_v4i16:
+; GFX11-FAKE16: ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: .LBB7_1: ; %bb1
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-FAKE16-NEXT: s_endpgm
;
-; CHECK-GISEL-LABEL: raw_atomic_buffer_load_v4i16:
-; CHECK-GISEL: ; %bb.0: ; %bb
-; CHECK-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-GISEL-NEXT: s_mov_b32 s4, 0
-; CHECK-GISEL-NEXT: .LBB7_1: ; %bb1
-; CHECK-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-GISEL-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v1
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v2
-; CHECK-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; CHECK-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; CHECK-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
-; CHECK-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-GISEL-NEXT: s_cbranch_execnz .LBB7_1
-; CHECK-GISEL-NEXT: ; %bb.2: ; %bb2
-; CHECK-GISEL-NEXT: s_endpgm
+; GFX11-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_v4i16:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-GISEL-TRUE16-NEXT: .LBB7_1: ; %bb1
+; GFX11-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: raw_atomic_buffer_load_v4i16:
+; GFX11-GISEL: ; %bb.0: ; %bb
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX11-GISEL-NEXT: .LBB7_1: ; %bb1
+; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX11-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-GISEL-NEXT: ; %bb.2: ; %bb2
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_v4i16:
+; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-SDAG-TRUE16-NEXT: .LBB7_1: ; %bb1
+; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_atomic_buffer_load_v4i16:
+; GFX12-FAKE16: ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT: .LBB7_1: ; %bb1
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT: s_endpgm
+;
+; GFX12-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_v4i16:
+; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-GISEL-TRUE16-NEXT: .LBB7_1: ; %bb1
+; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX12-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-GISEL-TRUE16-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -285,23 +510,42 @@ bb2:
}
define amdgpu_kernel void @raw_atomic_buffer_load_v4i32(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_v4i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB8_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB8_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_atomic_buffer_load_v4i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB8_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_atomic_buffer_load_v4i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB8_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b128 v[2:5], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -315,25 +559,46 @@ bb2:
}
define amdgpu_kernel void @raw_atomic_buffer_load_ptr(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_ptr:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB9_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_load_b32 v1, v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB9_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_atomic_buffer_load_ptr:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB9_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: flat_load_b32 v1, v[1:2]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_atomic_buffer_load_ptr:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB9_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: flat_load_b32 v1, v[2:3]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll
index 5c0e34c..d51e912 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.fadd.ll
@@ -1,58 +1,95 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 < %s | FileCheck -check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX12 %s
define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], s20 offen offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], s20 offen offset:24
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v1, 24, v1
+; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen
+; GFX12-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 24
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret void
}
define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
-; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: buffer_atomic_add_f32 v0, off, s[16:19], s20
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_atomic_add_f32 v0, off, s[16:19], s20
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s16
+; GFX12-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
ret void
}
define void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-; CHECK-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[16:19], s20 offen
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[16:19], s20 offen
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s16 offen
+; GFX12-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
define void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-; CHECK-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: buffer_atomic_pk_add_f16 v0, off, s[16:19], s20 offset:92
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_atomic_pk_add_f16 v0, off, s[16:19], s20 offset:92
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s16 offset:92
+; GFX12-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0)
ret void
}
define void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-; CHECK-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], s20 offen slc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_atomic_add_f32 v0, v1, s[16:19], s20 offen slc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_NT
+; GFX12-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll
index 8a6594f..1a1a1f7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll
@@ -6,6 +6,7 @@
; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefixes=GFX910,GFX10
; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX11
; RUN: llc -mcpu=gfx1200 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX12
+; RUN: llc -mcpu=gfx1250 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX12
define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %data_addr, ptr addrspace(1) %tfe_addr) {
; GFX67-LABEL: raw_buffer_load_i8_tfe:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
index 89511de..eeea1456 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
@@ -3,6 +3,7 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=GFX68,GFX8 %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck -check-prefixes=GFX11 %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
; GFX68-LABEL: buffer_store:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
index 561ec7d..6f7c001 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
@@ -1,27 +1,52 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=CHECK,CHECK-SDAG-TRUE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
define amdgpu_kernel void @raw_ptr_atomic_buffer_ptr_load_i32(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_ptr_load_i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB0_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB0_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_ptr_atomic_buffer_ptr_load_i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB0_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB0_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_ptr_atomic_buffer_ptr_load_i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB0_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB0_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -34,23 +59,42 @@ bb2:
}
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_off(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_off:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB1_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB1_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_ptr_atomic_buffer_load_i32_off:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB1_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB1_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_ptr_atomic_buffer_load_i32_off:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB1_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB1_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -62,23 +106,43 @@ bb2:
ret void
}
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_soff(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_soff:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB2_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB2_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_ptr_atomic_buffer_load_i32_soff:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB2_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB2_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_ptr_atomic_buffer_load_i32_soff:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_mov_b32 s5, 4
+; GFX12-NEXT: .LBB2_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], s5 offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB2_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -90,23 +154,42 @@ bb2:
ret void
}
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_dlc:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB3_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB3_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_ptr_atomic_buffer_load_i32_dlc:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB3_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB3_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_ptr_atomic_buffer_load_i32_dlc:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB3_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null offset:4 th:TH_LOAD_NT_RT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB3_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -119,24 +202,44 @@ bb2:
}
define amdgpu_kernel void @raw_nonptr_atomic_buffer_load_i32(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_nonptr_atomic_buffer_load_i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: .LBB4_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_and_b32 s1, exec_lo, vcc_lo
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; CHECK-NEXT: s_or_b32 s0, s1, s0
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; CHECK-NEXT: s_cbranch_execnz .LBB4_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_nonptr_atomic_buffer_load_i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: .LBB4_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_and_b32 s1, exec_lo, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB4_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_nonptr_atomic_buffer_load_i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: .LBB4_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_and_b32 s1, exec_lo, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b32 s0, s1, s0
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB4_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -149,23 +252,43 @@ bb2:
}
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_i64:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB5_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB5_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_ptr_atomic_buffer_load_i64:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB5_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB5_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_ptr_atomic_buffer_load_i64:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB5_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB5_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.zext = zext i32 %id to i64
@@ -179,23 +302,42 @@ bb2:
}
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_v2i16:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB6_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB6_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_ptr_atomic_buffer_load_v2i16:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB6_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB6_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_ptr_atomic_buffer_load_v2i16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB6_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB6_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -209,68 +351,151 @@ bb2:
}
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %ptr) {
-; CHECK-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
-; CHECK-SDAG-TRUE16: ; %bb.0: ; %bb
-; CHECK-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
-; CHECK-SDAG-TRUE16-NEXT: .LBB7_1: ; %bb1
-; CHECK-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; CHECK-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; CHECK-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; CHECK-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
-; CHECK-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
-; CHECK-SDAG-TRUE16-NEXT: s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT: .LBB7_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
;
-; CHECK-FAKE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
-; CHECK-FAKE16: ; %bb.0: ; %bb
-; CHECK-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-FAKE16-NEXT: s_mov_b32 s4, 0
-; CHECK-FAKE16-NEXT: .LBB7_1: ; %bb1
-; CHECK-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-FAKE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; CHECK-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; CHECK-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; CHECK-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-FAKE16-NEXT: s_cbranch_execnz .LBB7_1
-; CHECK-FAKE16-NEXT: ; %bb.2: ; %bb2
-; CHECK-FAKE16-NEXT: s_endpgm
+; GFX11-FAKE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; GFX11-FAKE16: ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: .LBB7_1: ; %bb1
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v1
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-FAKE16-NEXT: s_endpgm
;
-; CHECK-GISEL-LABEL: raw_ptr_atomic_buffer_load_v4i16:
-; CHECK-GISEL: ; %bb.0: ; %bb
-; CHECK-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-GISEL-NEXT: s_mov_b32 s4, 0
-; CHECK-GISEL-NEXT: .LBB7_1: ; %bb1
-; CHECK-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-GISEL-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v1
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v2
-; CHECK-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; CHECK-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; CHECK-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
-; CHECK-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-GISEL-NEXT: s_cbranch_execnz .LBB7_1
-; CHECK-GISEL-NEXT: ; %bb.2: ; %bb2
-; CHECK-GISEL-NEXT: s_endpgm
+; GFX11-GISEL-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-GISEL-TRUE16-NEXT: .LBB7_1: ; %bb1
+; GFX11-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; GFX11-GISEL: ; %bb.0: ; %bb
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX11-GISEL-NEXT: .LBB7_1: ; %bb1
+; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX11-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-GISEL-NEXT: ; %bb.2: ; %bb2
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-SDAG-TRUE16-NEXT: .LBB7_1: ; %bb1
+; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; GFX12-FAKE16: ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT: .LBB7_1: ; %bb1
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1
+; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT: s_endpgm
+;
+; GFX12-GISEL-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-GISEL-TRUE16-NEXT: .LBB7_1: ; %bb1
+; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB7_1
+; GFX12-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-GISEL-TRUE16-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -285,23 +510,42 @@ bb2:
}
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_v4i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB8_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB8_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_ptr_atomic_buffer_load_v4i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB8_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_ptr_atomic_buffer_load_v4i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB8_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b128 v[2:5], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -315,25 +559,46 @@ bb2:
}
define amdgpu_kernel void @raw_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_ptr:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB9_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_load_b32 v1, v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB9_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: raw_ptr_atomic_buffer_load_ptr:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB9_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: flat_load_b32 v1, v[1:2]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_ptr_atomic_buffer_load_ptr:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB9_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[0:3], null offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: flat_load_b32 v1, v[2:3]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
index 8b6ba1a..2c3b521 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
@@ -1,104 +1,174 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; FIXME: Test 90a, 940. 908 should fail to select.
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) #0 {
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s16 offen offset:128 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s16 offen offset:128 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_add_nc_u32_e32 v1, 0x80, v1
+; GFX1250-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 128
%ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret <2 x bfloat> %ret
}
define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset__slc(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) #0 {
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset__slc:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_NT_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset__slc:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_NT_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset__slc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 2)
ret <2 x bfloat> %ret
}
define void @raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) #0 {
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s16 offen offset:128
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s16 offen offset:128
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_add_nc_u32_e32 v1, 0x80, v1
+; GFX1250-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s16 offen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 128
%unused = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret void
}
define void @raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset__slc(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) #0 {
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset__slc:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_NT
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset__slc:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_NT
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset__slc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_NT
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%unused = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 2)
ret void
}
; Test waterfall loop
define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset_add__vgpr_soffset(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset) #0 {
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v1
-; GFX12-NEXT: v_readfirstlane_b32 s5, v2
-; GFX12-NEXT: v_readfirstlane_b32 s6, v3
-; GFX12-NEXT: v_readfirstlane_b32 s7, v4
-; GFX12-NEXT: v_readfirstlane_b32 s3, v6
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v6
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_b32 s0, s0, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v5, s[4:7], s3 offen offset:128 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
-; GFX12-NEXT: ; implicit-def: $vgpr6
-; GFX12-NEXT: ; implicit-def: $vgpr5
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB4_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_mov_b32 s2, exec_lo
+; GFX1200-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1200-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1200-NEXT: v_readfirstlane_b32 s5, v2
+; GFX1200-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1200-NEXT: v_readfirstlane_b32 s7, v4
+; GFX1200-NEXT: v_readfirstlane_b32 s3, v6
+; GFX1200-NEXT: s_wait_alu 0xf1ff
+; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
+; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v6
+; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_and_b32 s0, s0, s1
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v5, s[4:7], s3 offen offset:128 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX1200-NEXT: ; implicit-def: $vgpr6
+; GFX1200-NEXT: ; implicit-def: $vgpr5
+; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1200-NEXT: s_cbranch_execnz .LBB4_1
+; GFX1200-NEXT: ; %bb.2:
+; GFX1200-NEXT: s_mov_b32 exec_lo, s2
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v11, v4 :: v_dual_mov_b32 v10, v3
+; GFX1250-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v8, v1
+; GFX1250-NEXT: v_add_nc_u32_e32 v1, 0x80, v5
+; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v8
+; GFX1250-NEXT: v_readfirstlane_b32 s5, v9
+; GFX1250-NEXT: v_readfirstlane_b32 s6, v10
+; GFX1250-NEXT: v_readfirstlane_b32 s7, v11
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v6
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[8:9]
+; GFX1250-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[10:11]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, s3, v6
+; GFX1250-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_and_b32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[4:7], s3 offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: ; implicit-def: $vgpr8_vgpr9_vgpr10_vgpr11
+; GFX1250-NEXT: ; implicit-def: $vgpr6
+; GFX1250-NEXT: ; implicit-def: $vgpr1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB4_1
+; GFX1250-NEXT: ; %bb.2:
+; GFX1250-NEXT: s_mov_b32 exec_lo, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 128
%ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
ret <2 x bfloat> %ret
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll
index 8141e0d..ea8f836 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll
@@ -2,7 +2,8 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) #0 {
; GFX908-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
@@ -26,15 +27,22 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen scope:SCOPE_SYS
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen scope:SCOPE_SYS
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen scope:SCOPE_SYS
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 24)
ret void
}
@@ -61,15 +69,22 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s16
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s16
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s16
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
ret void
}
@@ -96,15 +111,22 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s16 offen
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s16 offen
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s16 offen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -131,15 +153,22 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffs
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s16 offset:92
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s16 offset:92
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s16 offset:92
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 0)
ret void
}
@@ -166,15 +195,22 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_NT
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_NT
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_NT
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll
index 767117d..2838740 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) #0 {
; GFX90A-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
@@ -18,16 +19,24 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 24)
ret float %ret
}
@@ -47,16 +56,24 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset_
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s16 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s16 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
ret float %ret
}
@@ -76,16 +93,24 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgp
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret <2 x half> %ret
}
@@ -105,16 +130,24 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_v
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s16 offset:92 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 0)
ret <2 x half> %ret
}
@@ -134,16 +167,24 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_NT_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_NT_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2)
ret float %ret
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll
index 3540468..4dd258b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll
@@ -3,7 +3,8 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX12 %s
define bfloat @raw_ptr_buffer_load_bf16(ptr addrspace(8) inreg %rsrc) {
; GFX7-LABEL: raw_ptr_buffer_load_bf16:
@@ -41,6 +42,14 @@ define bfloat @raw_ptr_buffer_load_bf16(ptr addrspace(8) inreg %rsrc) {
; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: raw_ptr_buffer_load_bf16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_u16 v0, off, s[0:3], null
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_set_pc_i64 s[30:31]
%val = call bfloat @llvm.amdgcn.raw.ptr.buffer.load.bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
ret bfloat %val
}
@@ -82,6 +91,14 @@ define <2 x bfloat> @raw_ptr_buffer_load_v2bf16(ptr addrspace(8) inreg %rsrc) {
; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: raw_ptr_buffer_load_v2bf16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_set_pc_i64 s[30:31]
%val = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v2bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
ret <2 x bfloat> %val
}
@@ -125,6 +142,14 @@ define <4 x bfloat> @raw_ptr_buffer_load_v4bf16(ptr addrspace(8) inreg %rsrc) {
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: raw_ptr_buffer_load_v4bf16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[0:3], null
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_set_pc_i64 s[30:31]
%val = call <4 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v4bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
ret <4 x bfloat> %val
}
@@ -178,6 +203,14 @@ define <8 x bfloat> @raw_ptr_buffer_load_v8bf16(ptr addrspace(8) inreg %rsrc) {
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: raw_ptr_buffer_load_v8bf16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[0:3], null
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_set_pc_i64 s[30:31]
%val = call <8 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v8bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
ret <8 x bfloat> %val
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
index e1f84dc..ec7d7d4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
@@ -3,7 +3,8 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX12 %s
define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, bfloat %data, i32 %offset) {
; GFX7-LABEL: buffer_store_bf16:
@@ -32,6 +33,11 @@ define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, bfloat %d
; GFX11: ; %bb.0:
; GFX11-NEXT: buffer_store_b16 v0, v1, s[0:3], 0 offen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_bf16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_store_b16 v0, v1, s[0:3], null offen
+; GFX12-NEXT: s_endpgm
call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
ret void
}
@@ -65,6 +71,11 @@ define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bf
; GFX11: ; %bb.0:
; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 offen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_v2bf16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_store_b32 v0, v1, s[0:3], null offen
+; GFX12-NEXT: s_endpgm
call void @llvm.amdgcn.raw.ptr.buffer.store.v2bf16(<2 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
ret void
}
@@ -102,6 +113,11 @@ define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bf
; GFX11: ; %bb.0:
; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 offen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_v4bf16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], null offen
+; GFX12-NEXT: s_endpgm
call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
ret void
}
@@ -153,6 +169,11 @@ define amdgpu_ps void @buffer_store_v8bf16(ptr addrspace(8) inreg %rsrc, <8 x bf
; GFX11: ; %bb.0:
; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 offen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_v8bf16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], null offen
+; GFX12-NEXT: s_endpgm
call void @llvm.amdgcn.raw.ptr.buffer.store.v8bf16(<8 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll
index 199494d..1e44a09 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll
@@ -31,5 +31,71 @@ define amdgpu_kernel void @rcp_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
ret void
}
+define amdgpu_kernel void @rcp_bf16_constant_4(ptr addrspace(1) %out) #1 {
+; SDAG-TRUE16-LABEL: rcp_bf16_constant_4:
+; SDAG-TRUE16: ; %bb.0:
+; SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3e80
+; SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-TRUE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-TRUE16-NEXT: s_endpgm
+;
+; SDAG-FAKE16-LABEL: rcp_bf16_constant_4:
+; SDAG-FAKE16: ; %bb.0:
+; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3e80
+; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: s_endpgm
+ %rcp = call bfloat @llvm.amdgcn.rcp.bf16(bfloat 4.0) #0
+ store bfloat %rcp, ptr addrspace(1) %out, align 2
+ ret void
+}
+
+define amdgpu_kernel void @rcp_bf16_constant_100(ptr addrspace(1) %out) #1 {
+; SDAG-TRUE16-LABEL: rcp_bf16_constant_100:
+; SDAG-TRUE16: ; %bb.0:
+; SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3c24
+; SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-TRUE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-TRUE16-NEXT: s_endpgm
+;
+; SDAG-FAKE16-LABEL: rcp_bf16_constant_100:
+; SDAG-FAKE16: ; %bb.0:
+; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c24
+; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: s_endpgm
+ %rcp = call bfloat @llvm.amdgcn.rcp.bf16(bfloat 100.0) #0
+ store bfloat %rcp, ptr addrspace(1) %out, align 2
+ ret void
+}
+
+define amdgpu_kernel void @rcp_undef_bf16(ptr addrspace(1) %out) #1 {
+; SDAG-TRUE16-LABEL: rcp_undef_bf16:
+; SDAG-TRUE16: ; %bb.0:
+; SDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x7fc0
+; SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-TRUE16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-TRUE16-NEXT: s_endpgm
+;
+; SDAG-FAKE16-LABEL: rcp_undef_bf16:
+; SDAG-FAKE16: ; %bb.0:
+; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0
+; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; SDAG-FAKE16-NEXT: s_endpgm
+ %rcp = call bfloat @llvm.amdgcn.rcp.bf16(bfloat undef)
+ store bfloat %rcp, ptr addrspace(1) %out, align 2
+ ret void
+}
+
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 90e150c..9003251 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -98,7 +98,6 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT4-NEXT: s_wait_kmcnt 0x0
; VARIANT4-NEXT: v_xad_u32 v0, v2, -1, s2
; VARIANT4-NEXT: global_store_b32 v3, v2, s[0:1]
-; VARIANT4-NEXT: s_wait_storecnt 0x0
; VARIANT4-NEXT: s_barrier_signal -1
; VARIANT4-NEXT: s_barrier_wait -1
; VARIANT4-NEXT: v_ashrrev_i32_e32 v1, 31, v0
@@ -145,7 +144,6 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT6-NEXT: v_sub_nc_u32_e32 v0, s2, v4
; VARIANT6-NEXT: global_store_b32 v5, v4, s[0:1]
; VARIANT6-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VARIANT6-NEXT: s_wait_storecnt 0x0
; VARIANT6-NEXT: s_barrier_signal -1
; VARIANT6-NEXT: s_barrier_wait -1
; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
index 651d204..248e0c7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll
@@ -11,7 +11,6 @@ define i1 @func1() {
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
@@ -27,7 +26,6 @@ define i1 @func1() {
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt_gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt_gfx1250.ll
new file mode 100644
index 0000000..f010199
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt_gfx1250.ll
@@ -0,0 +1,10 @@
+; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1250 < %s 2>&1 | FileCheck -check-prefix=ERR %s
+
+; ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.s.waitcnt
+
+define amdgpu_kernel void @test_waitcnt_builtin() {
+ call void @llvm.amdgcn.s.waitcnt(i32 0)
+ ret void
+}
+
+declare void @llvm.amdgcn.s.waitcnt(i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
index e4a87e3..d7f057f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11,GFX11-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-GISEL %s
define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_get_doorbell:
@@ -20,6 +22,24 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) {
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: test_get_doorbell:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL)
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_get_doorbell:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL)
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1250-GISEL-NEXT: s_endpgm
%ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 128)
store i32 %ret, ptr addrspace(1) %out
ret void
@@ -43,6 +63,24 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) {
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: test_get_ddid:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID)
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_get_ddid:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID)
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1250-GISEL-NEXT: s_endpgm
%ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 129)
store i32 %ret, ptr addrspace(1) %out
ret void
@@ -58,6 +96,16 @@ define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) {
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: test_get_tma:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA)
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 130)
store i64 %ret, ptr addrspace(1) %out
ret void
@@ -73,6 +121,16 @@ define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) {
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: test_get_realtime:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME)
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 131)
store i64 %ret, ptr addrspace(1) %out
ret void
@@ -96,6 +154,24 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) {
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: test_savewave:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE)
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_savewave:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE)
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1250-GISEL-NEXT: s_endpgm
%ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 132)
store i32 %ret, ptr addrspace(1) %out
ret void
@@ -111,6 +187,16 @@ define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) {
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: test_get_tba:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA)
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 133)
store i64 %ret, ptr addrspace(1) %out
ret void
@@ -134,6 +220,24 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) {
; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: test_get_0_i32:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0)
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_get_0_i32:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0)
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1250-GISEL-NEXT: s_endpgm
%ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 0)
store i32 %ret, ptr addrspace(1) %out
ret void
@@ -149,6 +253,16 @@ define amdgpu_kernel void @test_get_99999_i64(ptr addrspace(1) %out) {
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX1250-LABEL: test_get_99999_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 99999)
store i64 %ret, ptr addrspace(1) %out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
index f6f614e..8896364 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
@@ -1,30 +1,58 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-SDAG-TRUE16
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
-; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
-; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG-TRUE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-TRUE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
define amdgpu_kernel void @struct_atomic_buffer_load_i32(<4 x i32> %addr, i32 %index) {
-; CHECK-LABEL: struct_atomic_buffer_load_i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB0_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB0_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_atomic_buffer_load_i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB0_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB0_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_atomic_buffer_load_i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB0_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB0_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -37,23 +65,43 @@ bb2:
}
define amdgpu_kernel void @struct_atomic_buffer_load_i32_const_idx(<4 x i32> %addr) {
-; CHECK-LABEL: struct_atomic_buffer_load_i32_const_idx:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB1_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB1_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_atomic_buffer_load_i32_const_idx:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB1_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB1_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_atomic_buffer_load_i32_const_idx:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, 15
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB1_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB1_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -66,26 +114,48 @@ bb2:
}
define amdgpu_kernel void @struct_atomic_buffer_load_i32_off(<4 x i32> %addr, i32 %index) {
-; CHECK-LABEL: struct_atomic_buffer_load_i32_off:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB2_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB2_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_atomic_buffer_load_i32_off:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB2_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB2_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_atomic_buffer_load_i32_off:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB2_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB2_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -98,26 +168,49 @@ bb2:
}
define amdgpu_kernel void @struct_atomic_buffer_load_i32_soff(<4 x i32> %addr, i32 %index) {
-; CHECK-LABEL: struct_atomic_buffer_load_i32_soff:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB3_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB3_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_atomic_buffer_load_i32_soff:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB3_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB3_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_atomic_buffer_load_i32_soff:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_mov_b32 s5, 4
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB3_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], s5 idxen offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB3_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -129,26 +222,48 @@ bb2:
ret void
}
define amdgpu_kernel void @struct_atomic_buffer_load_i32_dlc(<4 x i32> %addr, i32 %index) {
-; CHECK-LABEL: struct_atomic_buffer_load_i32_dlc:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB4_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB4_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_atomic_buffer_load_i32_dlc:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB4_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB4_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_atomic_buffer_load_i32_dlc:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB4_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT_RT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB4_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -161,26 +276,49 @@ bb2:
}
define amdgpu_kernel void @struct_nonatomic_buffer_load_i32(<4 x i32> %addr, i32 %index) {
-; CHECK-LABEL: struct_nonatomic_buffer_load_i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_and_b32 v0, 0x3ff, v0
-; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: .LBB5_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_and_b32 s1, exec_lo, vcc_lo
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; CHECK-NEXT: s_or_b32 s0, s1, s0
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; CHECK-NEXT: s_cbranch_execnz .LBB5_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_nonatomic_buffer_load_i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: .LBB5_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_and_b32 s1, exec_lo, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB5_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_nonatomic_buffer_load_i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: .LBB5_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_and_b32 s1, exec_lo, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b32 s0, s1, s0
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB5_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -193,26 +331,49 @@ bb2:
}
define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %index) {
-; CHECK-LABEL: struct_atomic_buffer_load_i64:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v2, s6
-; CHECK-NEXT: .LBB6_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1]
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB6_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_atomic_buffer_load_i64:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v2, s6
+; GFX11-NEXT: .LBB6_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1]
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB6_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_atomic_buffer_load_i64:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: .LBB6_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB6_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.zext = zext i32 %id to i64
@@ -226,26 +387,48 @@ bb2:
}
define amdgpu_kernel void @struct_atomic_buffer_load_v2i16(<4 x i32> %addr, i32 %index) {
-; CHECK-LABEL: struct_atomic_buffer_load_v2i16:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB7_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB7_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_atomic_buffer_load_v2i16:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB7_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_atomic_buffer_load_v2i16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB7_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB7_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -259,77 +442,172 @@ bb2:
}
define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 %index) {
-; CHECK-SDAG-TRUE16-LABEL: struct_atomic_buffer_load_v4i16:
-; CHECK-SDAG-TRUE16: ; %bb.0: ; %bb
-; CHECK-SDAG-TRUE16-NEXT: s_clause 0x1
-; CHECK-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
-; CHECK-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-SDAG-TRUE16-NEXT: .LBB8_1: ; %bb1
-; CHECK-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; CHECK-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; CHECK-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; CHECK-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
-; CHECK-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
-; CHECK-SDAG-TRUE16-NEXT: s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: struct_atomic_buffer_load_v4i16:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_clause 0x1
+; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-SDAG-TRUE16-NEXT: .LBB8_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: struct_atomic_buffer_load_v4i16:
+; GFX11-FAKE16: ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-FAKE16-NEXT: .LBB8_1: ; %bb1
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-FAKE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-TRUE16-LABEL: struct_atomic_buffer_load_v4i16:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX11-GISEL-TRUE16-NEXT: s_clause 0x1
+; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-GISEL-TRUE16-NEXT: .LBB8_1: ; %bb1
+; GFX11-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: struct_atomic_buffer_load_v4i16:
+; GFX11-GISEL: ; %bb.0: ; %bb
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-GISEL-NEXT: .LBB8_1: ; %bb1
+; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v2
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX11-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX11-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-GISEL-NEXT: ; %bb.2: ; %bb2
+; GFX11-GISEL-NEXT: s_endpgm
;
-; CHECK-FAKE16-LABEL: struct_atomic_buffer_load_v4i16:
-; CHECK-FAKE16: ; %bb.0: ; %bb
-; CHECK-FAKE16-NEXT: s_clause 0x1
-; CHECK-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-FAKE16-NEXT: s_mov_b32 s4, 0
-; CHECK-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-FAKE16-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-FAKE16-NEXT: .LBB8_1: ; %bb1
-; CHECK-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-FAKE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; CHECK-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; CHECK-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-FAKE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; CHECK-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-FAKE16-NEXT: s_cbranch_execnz .LBB8_1
-; CHECK-FAKE16-NEXT: ; %bb.2: ; %bb2
-; CHECK-FAKE16-NEXT: s_endpgm
+; GFX12-SDAG-TRUE16-LABEL: struct_atomic_buffer_load_v4i16:
+; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1
+; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-SDAG-TRUE16-NEXT: .LBB8_1: ; %bb1
+; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-SDAG-TRUE16-NEXT: s_endpgm
;
-; CHECK-GISEL-LABEL: struct_atomic_buffer_load_v4i16:
-; CHECK-GISEL: ; %bb.0: ; %bb
-; CHECK-GISEL-NEXT: s_clause 0x1
-; CHECK-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-GISEL-NEXT: s_mov_b32 s4, 0
-; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-GISEL-NEXT: .LBB8_1: ; %bb1
-; CHECK-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-GISEL-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; CHECK-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; CHECK-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; CHECK-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
-; CHECK-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-GISEL-NEXT: s_cbranch_execnz .LBB8_1
-; CHECK-GISEL-NEXT: ; %bb.2: ; %bb2
-; CHECK-GISEL-NEXT: s_endpgm
+; GFX12-FAKE16-LABEL: struct_atomic_buffer_load_v4i16:
+; GFX12-FAKE16: ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT: s_clause 0x1
+; GFX12-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-FAKE16-NEXT: .LBB8_1: ; %bb1
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT: s_endpgm
+;
+; GFX12-GISEL-TRUE16-LABEL: struct_atomic_buffer_load_v4i16:
+; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT: s_clause 0x1
+; GFX12-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-GISEL-TRUE16-NEXT: .LBB8_1: ; %bb1
+; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-GISEL-TRUE16-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -344,26 +622,48 @@ bb2:
}
define amdgpu_kernel void @struct_atomic_buffer_load_v4i32(<4 x i32> %addr, i32 %index) {
-; CHECK-LABEL: struct_atomic_buffer_load_v4i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB9_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB9_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_atomic_buffer_load_v4i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB9_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_atomic_buffer_load_v4i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB9_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -377,28 +677,52 @@ bb2:
}
define amdgpu_kernel void @struct_atomic_buffer_load_ptr(<4 x i32> %addr, i32 %index) {
-; CHECK-LABEL: struct_atomic_buffer_load_ptr:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB10_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_load_b32 v2, v[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB10_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_atomic_buffer_load_ptr:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB10_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: flat_load_b32 v2, v[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB10_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_atomic_buffer_load_ptr:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB10_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: flat_load_b32 v2, v[2:3]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB10_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
index 13b28d4..9abbc06 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
@@ -6,6 +6,7 @@
; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefixes=GFX910,GFX10
; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX11
; RUN: llc -mcpu=gfx1200 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX12
+; RUN: llc -mcpu=gfx1250 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX12
define amdgpu_ps void @struct_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %data_addr, ptr addrspace(1) %tfe_addr) {
; GFX67-LABEL: struct_buffer_load_i8_tfe:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
index 9ce33c6..822016b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
@@ -3,6 +3,8 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=GFX68,GFX8 %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
; GFX68-LABEL: buffer_store:
@@ -21,6 +23,15 @@ define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <
; GFX11-NEXT: buffer_store_b128 v[4:7], v12, s[0:3], 0 idxen glc
; GFX11-NEXT: buffer_store_b128 v[8:11], v12, s[0:3], 0 idxen slc
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: v_mov_b32_e32 v12, 0
+; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: buffer_store_b128 v[0:3], v12, s[0:3], null idxen
+; GFX12-NEXT: buffer_store_b128 v[4:7], v12, s[0:3], null idxen th:TH_STORE_NT
+; GFX12-NEXT: buffer_store_b128 v[8:11], v12, s[0:3], null idxen th:TH_STORE_HT
+; GFX12-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1)
@@ -40,6 +51,12 @@ define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 idxen offset:42
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_immoffs:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], null idxen offset:42
+; GFX12-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0)
ret void
@@ -55,6 +72,11 @@ define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 idxen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_idx:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], null idxen
+; GFX12-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0)
ret void
@@ -76,6 +98,12 @@ define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
; GFX11-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, s4
; GFX11-NEXT: buffer_store_b128 v[0:3], v[4:5], s[0:3], 0 idxen offen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_ofs:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, 0
+; GFX12-NEXT: buffer_store_b128 v[0:3], v[4:5], s[0:3], null idxen offen
+; GFX12-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i32 0, i32 0)
ret void
@@ -91,6 +119,11 @@ define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32)
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: buffer_store_b128 v[0:3], v[4:5], s[0:3], 0 idxen offen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_both:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: buffer_store_b128 v[0:3], v[4:5], s[0:3], null idxen offen
+; GFX12-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i32 0, i32 0)
ret void
@@ -108,6 +141,12 @@ define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>,
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: buffer_store_b128 v[0:3], v[5:6], s[0:3], 0 idxen offen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_both_reversed:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: v_dual_mov_b32 v6, v5 :: v_dual_mov_b32 v7, v4
+; GFX12-NEXT: buffer_store_b128 v[0:3], v[6:7], s[0:3], null idxen offen
+; GFX12-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i32 0, i32 0)
ret void
@@ -139,6 +178,15 @@ define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32,
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_store_b128 v[0:3], v6, s[0:3], 0 idxen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_wait:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], null idxen
+; GFX12-NEXT: buffer_load_b128 v[0:3], v5, s[0:3], null idxen
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_store_b128 v[0:3], v6, s[0:3], null idxen
+; GFX12-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0)
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0, i32 0)
@@ -156,6 +204,11 @@ define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 idxen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_x1:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: buffer_store_b32 v0, v1, s[0:3], null idxen
+; GFX12-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
ret void
@@ -171,6 +224,11 @@ define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data,
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_x2:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], null idxen
+; GFX12-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
ret void
@@ -193,6 +251,15 @@ define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i
; GFX11-NEXT: buffer_store_b64 v[4:5], v7, s[0:3], 0 idxen glc
; GFX11-NEXT: buffer_store_b32 v6, v7, s[0:3], 0 idxen slc
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: buffer_store_int:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: v_mov_b32_e32 v7, 0
+; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: buffer_store_b128 v[0:3], v7, s[0:3], null idxen
+; GFX12-NEXT: buffer_store_b64 v[4:5], v7, s[0:3], null idxen th:TH_STORE_NT
+; GFX12-NEXT: buffer_store_b32 v6, v7, s[0:3], null idxen th:TH_STORE_HT
+; GFX12-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1)
@@ -212,6 +279,12 @@ define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: buffer_store_b8 v0, v1, s[0:3], 0 idxen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_buffer_store_byte:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX12-NEXT: buffer_store_b8 v0, v1, s[0:3], null idxen
+; GFX12-NEXT: s_endpgm
main_body:
%v2 = fptoui float %v1 to i32
%v3 = trunc i32 %v2 to i8
@@ -237,6 +310,18 @@ define amdgpu_ps void @struct_buffer_store_f16(<4 x i32> inreg %rsrc, float %v1,
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, v1, s[0:3], 0 idxen
; GFX11-FAKE16-NEXT: s_endpgm
+;
+; GFX12-TRUE16-LABEL: struct_buffer_store_f16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX12-TRUE16-NEXT: buffer_store_b16 v0, v1, s[0:3], null idxen
+; GFX12-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: struct_buffer_store_f16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX12-FAKE16-NEXT: buffer_store_b16 v0, v1, s[0:3], null idxen
+; GFX12-FAKE16-NEXT: s_endpgm
%v2 = fptrunc float %v1 to half
call void @llvm.amdgcn.struct.buffer.store.f16(half %v2, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
ret void
@@ -261,6 +346,11 @@ define amdgpu_ps void @struct_buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x hal
; GFX11: ; %bb.0:
; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 idxen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_buffer_store_v2f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_store_b32 v0, v1, s[0:3], null idxen
+; GFX12-NEXT: s_endpgm
call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
ret void
}
@@ -288,6 +378,11 @@ define amdgpu_ps void @struct_buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x hal
; GFX11: ; %bb.0:
; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_buffer_store_v4f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], null idxen
+; GFX12-NEXT: s_endpgm
call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
ret void
}
@@ -304,6 +399,12 @@ define amdgpu_ps void @struct_buffer_store_i16(<4 x i32> inreg %rsrc, float %v1,
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: buffer_store_b16 v0, v1, s[0:3], 0 idxen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_buffer_store_i16:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX12-NEXT: buffer_store_b16 v0, v1, s[0:3], null idxen
+; GFX12-NEXT: s_endpgm
main_body:
%v2 = fptoui float %v1 to i32
%v3 = trunc i32 %v2 to i16
@@ -329,6 +430,11 @@ define amdgpu_ps void @struct_buffer_store_vif16(<4 x i32> inreg %rsrc, <2 x i16
; GFX11: ; %bb.0:
; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 idxen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_buffer_store_vif16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_store_b32 v0, v1, s[0:3], null idxen
+; GFX12-NEXT: s_endpgm
call void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
ret void
}
@@ -354,6 +460,11 @@ define amdgpu_ps void @struct_buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16
; GFX11: ; %bb.0:
; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_buffer_store_v4i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], null idxen
+; GFX12-NEXT: s_endpgm
call void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
index 8f33dd6..23db247 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
@@ -1,30 +1,58 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-SDAG-TRUE16
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
-; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
-; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG-TRUE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL-TRUE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG-TRUE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL-TRUE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB0_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB0_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_ptr_atomic_buffer_load_i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB0_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB0_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB0_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB0_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -37,23 +65,43 @@ bb2:
}
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_const_idx(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_const_idx:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: .LBB1_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB1_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_ptr_atomic_buffer_load_i32_const_idx:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: .LBB1_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB1_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32_const_idx:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, 15
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: .LBB1_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB1_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -66,26 +114,48 @@ bb2:
}
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_off(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_off:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB2_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB2_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_ptr_atomic_buffer_load_i32_off:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB2_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB2_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32_off:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB2_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB2_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -98,26 +168,49 @@ bb2:
}
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_soff(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_soff:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB3_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB3_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_ptr_atomic_buffer_load_i32_soff:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB3_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 4 idxen offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB3_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32_soff:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_mov_b32 s5, 4
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB3_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], s5 idxen offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB3_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -129,26 +222,48 @@ bb2:
ret void
}
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32_dlc:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB4_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB4_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_ptr_atomic_buffer_load_i32_dlc:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB4_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen offset:4 dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB4_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32_dlc:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB4_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT_RT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB4_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -161,26 +276,49 @@ bb2:
}
define amdgpu_kernel void @struct_ptr_nonatomic_buffer_load_i32(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-LABEL: struct_ptr_nonatomic_buffer_load_i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_and_b32 v0, 0x3ff, v0
-; CHECK-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT: .LBB5_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_and_b32 s1, exec_lo, vcc_lo
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; CHECK-NEXT: s_or_b32 s0, s1, s0
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; CHECK-NEXT: s_cbranch_execnz .LBB5_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_ptr_nonatomic_buffer_load_i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX11-NEXT: .LBB5_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_and_b32 s1, exec_lo, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB5_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_ptr_nonatomic_buffer_load_i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; GFX12-NEXT: .LBB5_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_and_b32 s1, exec_lo, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b32 s0, s1, s0
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB5_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -193,26 +331,49 @@ bb2:
}
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-LABEL: struct_ptr_atomic_buffer_load_i64:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v2, s6
-; CHECK-NEXT: .LBB6_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1]
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB6_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_ptr_atomic_buffer_load_i64:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v2, s6
+; GFX11-NEXT: .LBB6_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b64 v[3:4], v2, s[0:3], 0 idxen offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[3:4], v[0:1]
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB6_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_ptr_atomic_buffer_load_i64:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: .LBB6_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB6_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.zext = zext i32 %id to i64
@@ -226,26 +387,48 @@ bb2:
}
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-LABEL: struct_ptr_atomic_buffer_load_v2i16:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB7_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB7_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_ptr_atomic_buffer_load_v2i16:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB7_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 idxen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_ptr_atomic_buffer_load_v2i16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB7_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null idxen th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB7_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -259,77 +442,172 @@ bb2:
}
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-SDAG-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
-; CHECK-SDAG-TRUE16: ; %bb.0: ; %bb
-; CHECK-SDAG-TRUE16-NEXT: s_clause 0x1
-; CHECK-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
-; CHECK-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-SDAG-TRUE16-NEXT: .LBB8_1: ; %bb1
-; CHECK-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; CHECK-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; CHECK-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; CHECK-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
-; CHECK-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
-; CHECK-SDAG-TRUE16-NEXT: s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX11-SDAG-TRUE16-NEXT: s_clause 0x1
+; GFX11-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-SDAG-TRUE16-NEXT: .LBB8_1: ; %bb1
+; GFX11-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-SDAG-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; GFX11-FAKE16: ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-FAKE16-NEXT: .LBB8_1: ; %bb1
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-FAKE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX11-GISEL-TRUE16-NEXT: s_clause 0x1
+; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-GISEL-TRUE16-NEXT: .LBB8_1: ; %bb1
+; GFX11-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX11-GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; GFX11-GISEL: ; %bb.0: ; %bb
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-GISEL-NEXT: .LBB8_1: ; %bb1
+; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s5, v2
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX11-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX11-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-GISEL-NEXT: ; %bb.2: ; %bb2
+; GFX11-GISEL-NEXT: s_endpgm
;
-; CHECK-FAKE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
-; CHECK-FAKE16: ; %bb.0: ; %bb
-; CHECK-FAKE16-NEXT: s_clause 0x1
-; CHECK-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-FAKE16-NEXT: s_mov_b32 s4, 0
-; CHECK-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-FAKE16-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-FAKE16-NEXT: .LBB8_1: ; %bb1
-; CHECK-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-FAKE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; CHECK-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; CHECK-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-FAKE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; CHECK-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-FAKE16-NEXT: s_cbranch_execnz .LBB8_1
-; CHECK-FAKE16-NEXT: ; %bb.2: ; %bb2
-; CHECK-FAKE16-NEXT: s_endpgm
+; GFX12-SDAG-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT: s_clause 0x1
+; GFX12-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-SDAG-TRUE16-NEXT: .LBB8_1: ; %bb1
+; GFX12-SDAG-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-SDAG-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-SDAG-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-SDAG-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-SDAG-TRUE16-NEXT: s_endpgm
;
-; CHECK-GISEL-LABEL: struct_ptr_atomic_buffer_load_v4i16:
-; CHECK-GISEL: ; %bb.0: ; %bb
-; CHECK-GISEL-NEXT: s_clause 0x1
-; CHECK-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-GISEL-NEXT: s_mov_b32 s4, 0
-; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-GISEL-NEXT: .LBB8_1: ; %bb1
-; CHECK-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-GISEL-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s5, v2
-; CHECK-GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; CHECK-GISEL-NEXT: s_pack_ll_b32_b16 s5, s5, s6
-; CHECK-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; CHECK-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
-; CHECK-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-GISEL-NEXT: s_cbranch_execnz .LBB8_1
-; CHECK-GISEL-NEXT: ; %bb.2: ; %bb2
-; CHECK-GISEL-NEXT: s_endpgm
+; GFX12-FAKE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; GFX12-FAKE16: ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT: s_clause 0x1
+; GFX12-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX12-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-FAKE16-NEXT: .LBB8_1: ; %bb1
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX12-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-FAKE16-NEXT: s_endpgm
+;
+; GFX12-GISEL-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT: s_clause 0x1
+; GFX12-GISEL-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-GISEL-TRUE16-NEXT: .LBB8_1: ; %bb1
+; GFX12-GISEL-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-TRUE16-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.h, v3.l
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-GISEL-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-GISEL-TRUE16-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-GISEL-TRUE16-NEXT: ; %bb.2: ; %bb2
+; GFX12-GISEL-TRUE16-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -344,26 +622,48 @@ bb2:
}
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-LABEL: struct_ptr_atomic_buffer_load_v4i32:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB9_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB9_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_ptr_atomic_buffer_load_v4i32:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB9_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_ptr_atomic_buffer_load_v4i32:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB9_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b128 v[2:5], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
@@ -377,28 +677,52 @@ bb2:
}
define amdgpu_kernel void @struct_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-LABEL: struct_ptr_atomic_buffer_load_ptr:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s6
-; CHECK-NEXT: .LBB10_1: ; %bb1
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_load_b32 v2, v[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT: s_cbranch_execnz .LBB10_1
-; CHECK-NEXT: ; %bb.2: ; %bb2
-; CHECK-NEXT: s_endpgm
+; GFX11-LABEL: struct_ptr_atomic_buffer_load_ptr:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: .LBB10_1: ; %bb1
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: flat_load_b32 v2, v[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB10_1
+; GFX11-NEXT: ; %bb.2: ; %bb2
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: struct_ptr_atomic_buffer_load_ptr:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-NEXT: s_wait_xcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: .LBB10_1: ; %bb1
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: buffer_load_b64 v[2:3], v1, s[0:3], null idxen offset:4 th:TH_LOAD_NT
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: flat_load_b32 v2, v[2:3]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB10_1
+; GFX12-NEXT: ; %bb.2: ; %bb2
+; GFX12-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
index 746b879..4366472 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
@@ -3,6 +3,7 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) #0 {
; GFX908-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
@@ -39,6 +40,14 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s16 idxen offen
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s16 idxen offen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -75,6 +84,13 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 idxen
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 idxen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret void
}
@@ -114,6 +130,14 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_NT
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret void
}
@@ -153,6 +177,14 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s16 idxen offen
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s16 idxen offen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -291,6 +323,42 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v
; GFX1200-NEXT: ; %bb.2:
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v8, v5
+; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1250-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1250-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1250-NEXT: v_readfirstlane_b32 s7, v5
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
+; GFX1250-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1250-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_and_b32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[4:7], s3 idxen offen
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX1250-NEXT: ; implicit-def: $vgpr7
+; GFX1250-NEXT: ; implicit-def: $vgpr0
+; GFX1250-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB4_1
+; GFX1250-NEXT: ; %bb.2:
+; GFX1250-NEXT: s_mov_b32 exec_lo, s2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -429,6 +497,42 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr
; GFX1200-NEXT: ; %bb.2:
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v8, v5
+; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1250-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1250-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1250-NEXT: v_readfirstlane_b32 s7, v5
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
+; GFX1250-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1250-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_and_b32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1250-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[4:7], s3 idxen offen
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX1250-NEXT: ; implicit-def: $vgpr7
+; GFX1250-NEXT: ; implicit-def: $vgpr0
+; GFX1250-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB5_1
+; GFX1250-NEXT: ; %bb.2:
+; GFX1250-NEXT: s_mov_b32 exec_lo, s2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
index 71c63bf..0191a85 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
@@ -2,6 +2,7 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) #0 {
; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
@@ -32,6 +33,15 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %ret
}
@@ -62,6 +72,14 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffs
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 idxen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s16 idxen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret float %ret
}
@@ -95,6 +113,15 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT_RETURN
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret float %ret
}
@@ -128,6 +155,15 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__
; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <2 x half> %ret
}
@@ -237,6 +273,43 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v8, v5
+; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1250-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1250-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1250-NEXT: v_readfirstlane_b32 s7, v5
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
+; GFX1250-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1250-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_and_b32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_atomic_add_f32 v0, v[8:9], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX1250-NEXT: ; implicit-def: $vgpr7
+; GFX1250-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB4_1
+; GFX1250-NEXT: ; %bb.2:
+; GFX1250-NEXT: s_mov_b32 exec_lo, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %ret
}
@@ -346,6 +419,43 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v8, v5
+; GFX1250-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1250-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1250-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1250-NEXT: v_readfirstlane_b32 s7, v5
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
+; GFX1250-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1250-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_and_b32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_atomic_pk_add_f16 v0, v[8:9], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX1250-NEXT: ; implicit-def: $vgpr7
+; GFX1250-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB5_1
+; GFX1250-NEXT: ; %bb.2:
+; GFX1250-NEXT: s_mov_b32 exec_lo, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <2 x half> %ret
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
index e3889ab..d551d91 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
@@ -4,7 +4,8 @@
; Not supported in gfx8 or gfx9
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GFX6-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
@@ -35,16 +36,25 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_max_num_f32 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %ret
}
@@ -78,16 +88,25 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_add_nc_u32 v5, 0x100, v2
+; GFX1250-NEXT: buffer_atomic_max_num_f32 v0, v[4:5], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
ret float %ret
@@ -122,16 +141,24 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffs
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s16 idxen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s16 idxen th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s16 idxen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret float %ret
}
@@ -165,16 +192,25 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_max_num_f32 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret float %ret
}
@@ -206,15 +242,23 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s16 idxen offen
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_max_num_f32 v0, v[2:3], s[0:3], s16 idxen offen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -246,15 +290,23 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_add_nc_u32 v5, 0x100, v2
+; GFX1250-NEXT: buffer_atomic_max_num_f32 v0, v[4:5], s[0:3], s16 idxen offen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
ret void
@@ -288,15 +340,22 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff
; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], s16 idxen
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s16 idxen
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s16 idxen
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s16 idxen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret void
}
@@ -328,15 +387,23 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX11-NEXT: buffer_atomic_max_f32 v0, v[1:2], s[0:3], s16 idxen offen slc
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_max_num_f32 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_NT
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret void
}
@@ -442,36 +509,68 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v1
-; GFX12-NEXT: v_readfirstlane_b32 s5, v2
-; GFX12-NEXT: v_readfirstlane_b32 s6, v3
-; GFX12-NEXT: v_readfirstlane_b32 s7, v4
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
-; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s1, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
-; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB8_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_mov_b32 s2, exec_lo
+; GFX1200-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1200-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1200-NEXT: v_readfirstlane_b32 s5, v2
+; GFX1200-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1200-NEXT: v_readfirstlane_b32 s7, v4
+; GFX1200-NEXT: s_wait_alu 0xf1ff
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX1200-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
+; GFX1200-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_and_saveexec_b32 s1, s1
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX1200-NEXT: s_cbranch_execnz .LBB8_1
+; GFX1200-NEXT: ; %bb.2:
+; GFX1200-NEXT: s_mov_b32 exec_lo, s2
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_add__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v5, v4
+; GFX1250-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
+; GFX1250-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_add_nc_u32 v9, 0x100, v6
+; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1250-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1250-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1250-NEXT: v_readfirstlane_b32 s7, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
+; GFX1250-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[4:5]
+; GFX1250-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_saveexec_b32 s1, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_atomic_max_num_f32 v0, v[8:9], s[4:7], s0 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX1250-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX1250-NEXT: s_cbranch_execnz .LBB8_1
+; GFX1250-NEXT: ; %bb.2:
+; GFX1250-NEXT: s_mov_b32 exec_lo, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
ret float %ret
@@ -595,41 +694,78 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v1
-; GFX12-NEXT: v_readfirstlane_b32 s5, v2
-; GFX12-NEXT: v_readfirstlane_b32 s6, v3
-; GFX12-NEXT: v_readfirstlane_b32 s7, v4
-; GFX12-NEXT: v_readfirstlane_b32 s3, v7
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_b32 s0, s0, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
-; GFX12-NEXT: ; implicit-def: $vgpr7
-; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB9_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_mov_b32 s2, exec_lo
+; GFX1200-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX1200-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1200-NEXT: v_readfirstlane_b32 s5, v2
+; GFX1200-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1200-NEXT: v_readfirstlane_b32 s7, v4
+; GFX1200-NEXT: v_readfirstlane_b32 s3, v7
+; GFX1200-NEXT: s_wait_alu 0xf1ff
+; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
+; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_and_b32 s0, s0, s1
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX1200-NEXT: ; implicit-def: $vgpr7
+; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1200-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1200-NEXT: ; %bb.2:
+; GFX1200-NEXT: s_mov_b32 exec_lo, s2
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_add__vgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v5, v4
+; GFX1250-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
+; GFX1250-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_add_nc_u32 v9, 0x100, v6
+; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1250-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1250-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1250-NEXT: v_readfirstlane_b32 s7, v5
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
+; GFX1250-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1250-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_and_b32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_atomic_max_num_f32 v0, v[8:9], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX1250-NEXT: ; implicit-def: $vgpr7
+; GFX1250-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1250-NEXT: ; %bb.2:
+; GFX1250-NEXT: s_mov_b32 exec_lo, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
ret float %ret
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
index f001bf9..0096289 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
@@ -4,7 +4,8 @@
; Not supported in gfx8 or gfx9
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GFX6-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
@@ -35,16 +36,25 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_min_num_f32 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %ret
}
@@ -78,16 +88,25 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_add_nc_u32 v5, 0x100, v2
+; GFX1250-NEXT: buffer_atomic_min_num_f32 v0, v[4:5], s[0:3], s16 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
ret float %ret
@@ -122,16 +141,24 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voff
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s16 idxen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s16 idxen th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s16 idxen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret float %ret
}
@@ -165,16 +192,25 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT_RETURN
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_min_num_f32 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret float %ret
}
@@ -206,15 +242,23 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_
; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s16 idxen offen
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_min_num_f32 v0, v[2:3], s[0:3], s16 idxen offen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
@@ -246,15 +290,23 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_
; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen offset:256
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_add_nc_u32 v5, 0x100, v2
+; GFX1250-NEXT: buffer_atomic_min_num_f32 v0, v[4:5], s[0:3], s16 idxen offen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
ret void
@@ -288,15 +340,22 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_vof
; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], s16 idxen
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s16 idxen
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s16 idxen
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s16 idxen
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret void
}
@@ -328,15 +387,23 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_
; GFX11-NEXT: buffer_atomic_min_f32 v0, v[1:2], s[0:3], s16 idxen offen slc
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s16 idxen offen th:TH_ATOMIC_NT
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: buffer_atomic_min_num_f32 v0, v[2:3], s[0:3], s16 idxen offen th:TH_ATOMIC_NT
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret void
}
@@ -442,36 +509,68 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v1
-; GFX12-NEXT: v_readfirstlane_b32 s5, v2
-; GFX12-NEXT: v_readfirstlane_b32 s6, v3
-; GFX12-NEXT: v_readfirstlane_b32 s7, v4
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
-; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s1, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
-; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB8_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_mov_b32 s2, exec_lo
+; GFX1200-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1200-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1200-NEXT: v_readfirstlane_b32 s5, v2
+; GFX1200-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1200-NEXT: v_readfirstlane_b32 s7, v4
+; GFX1200-NEXT: s_wait_alu 0xf1ff
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX1200-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
+; GFX1200-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_and_saveexec_b32 s1, s1
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX1200-NEXT: s_cbranch_execnz .LBB8_1
+; GFX1200-NEXT: ; %bb.2:
+; GFX1200-NEXT: s_mov_b32 exec_lo, s2
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_voffset_fmin__sgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v5, v4
+; GFX1250-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
+; GFX1250-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_add_nc_u32 v9, 0x100, v6
+; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1250-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1250-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1250-NEXT: v_readfirstlane_b32 s7, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
+; GFX1250-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[4:5]
+; GFX1250-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_saveexec_b32 s1, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_atomic_min_num_f32 v0, v[8:9], s[4:7], s0 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX1250-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; GFX1250-NEXT: s_cbranch_execnz .LBB8_1
+; GFX1250-NEXT: ; %bb.2:
+; GFX1250-NEXT: s_mov_b32 exec_lo, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
ret float %ret
@@ -595,41 +694,78 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v1
-; GFX12-NEXT: v_readfirstlane_b32 s5, v2
-; GFX12-NEXT: v_readfirstlane_b32 s6, v3
-; GFX12-NEXT: v_readfirstlane_b32 s7, v4
-; GFX12-NEXT: v_readfirstlane_b32 s3, v7
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_b32 s0, s0, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
-; GFX12-NEXT: ; implicit-def: $vgpr7
-; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB9_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX1200-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_mov_b32 s2, exec_lo
+; GFX1200-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX1200-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1200-NEXT: v_readfirstlane_b32 s5, v2
+; GFX1200-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1200-NEXT: v_readfirstlane_b32 s7, v4
+; GFX1200-NEXT: v_readfirstlane_b32 s3, v7
+; GFX1200-NEXT: s_wait_alu 0xf1ff
+; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
+; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_and_b32 s0, s0, s1
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 th:TH_ATOMIC_RETURN
+; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; GFX1200-NEXT: ; implicit-def: $vgpr7
+; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1200-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1200-NEXT: ; %bb.2:
+; GFX1200-NEXT: s_mov_b32 exec_lo, s2
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_voffset_fmin__vgpr_soffset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v5, v4
+; GFX1250-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
+; GFX1250-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_add_nc_u32 v9, 0x100, v6
+; GFX1250-NEXT: s_mov_b32 s2, exec_lo
+; GFX1250-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1250-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1250-NEXT: v_readfirstlane_b32 s6, v4
+; GFX1250-NEXT: v_readfirstlane_b32 s7, v5
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
+; GFX1250-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1250-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1250-NEXT: s_and_b32 s0, s0, s1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_saveexec_b32 s0, s0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: buffer_atomic_min_num_f32 v0, v[8:9], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX1250-NEXT: ; implicit-def: $vgpr7
+; GFX1250-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX1250-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1250-NEXT: ; %bb.2:
+; GFX1250-NEXT: s_mov_b32 exec_lo, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
ret float %ret
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index 7d44d91..e8b8d05 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -173,7 +173,6 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
; GFX8-NOOPT-NEXT: v_add_u32_e64 v2, s[0:1], v0, v1
; GFX8-NOOPT-NEXT: s_mov_b32 s0, 0
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NOOPT-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, v0
@@ -214,9 +213,9 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX11-NEXT: ds_load_b32 v1, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_barrier
; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -267,7 +266,6 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3
; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr2
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0
@@ -293,8 +291,6 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOOPT-NEXT: s_nop 1
; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -364,7 +360,6 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3
; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr2
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0
@@ -390,8 +385,6 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOOPT-NEXT: s_nop 1
; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -461,7 +454,6 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3
; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr2
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0
@@ -487,8 +479,6 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOOPT-NEXT: s_nop 1
; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -558,7 +548,6 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3
; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr2
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0
@@ -584,8 +573,6 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOOPT-NEXT: s_nop 1
; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -655,7 +642,6 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3
; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 0
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr2
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0
@@ -681,8 +667,6 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOOPT-NEXT: s_nop 1
; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -900,7 +884,6 @@ define amdgpu_kernel void @update_dppi64_imm_old_test(ptr addrspace(1) %arg, i64
; GFX8-NOOPT-NEXT: s_mov_b32 s0, 3
; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s0, v0
; GFX8-NOOPT-NEXT: s_mov_b32 s0, 0
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0
@@ -930,8 +913,6 @@ define amdgpu_kernel void @update_dppi64_imm_old_test(ptr addrspace(1) %arg, i64
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOOPT-NEXT: s_nop 1
; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -999,7 +980,6 @@ define amdgpu_kernel void @update_dppf64_imm_old_test(ptr addrspace(1) %arg, dou
; GFX8-NOOPT-NEXT: s_mov_b32 s0, 3
; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s0, v0
; GFX8-NOOPT-NEXT: s_mov_b32 s0, 0
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0
@@ -1029,8 +1009,6 @@ define amdgpu_kernel void @update_dppf64_imm_old_test(ptr addrspace(1) %arg, dou
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOOPT-NEXT: s_nop 1
; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr0
; GFX8-NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NOOPT-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -1121,8 +1099,6 @@ define amdgpu_kernel void @update_dppi64_imm_src_test(ptr addrspace(1) %out, i64
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT: s_nop 1
; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr4
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr4
; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NOOPT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1211,8 +1187,6 @@ define amdgpu_kernel void @update_dppf64_imm_src_test(ptr addrspace(1) %out, dou
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NOOPT-NEXT: s_nop 1
; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr4
-; GFX8-NOOPT-NEXT: ; implicit-def: $sgpr4
; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2
; GFX8-NOOPT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
index f668a116..c597693 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
@@ -1,10 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: not --crash llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -global-isel=0 < %s 2>&1 | FileCheck -check-prefix=GFX9-SDAG-ERR %s
-; RUN: not llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -global-isel=1 < %s 2>&1 | FileCheck -check-prefix=GFX9-GISEL-ERR %s
+; RUN: not llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -global-isel=1 -new-reg-bank-select < %s 2>&1 | FileCheck -check-prefix=GFX9-GISEL-ERR %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -global-isel=0 < %s | FileCheck -check-prefix=GFX1250 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefix=GFX1250 %s
; GFX9-SDAG-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.wave.id
; GFX9-GISEL-ERR: LLVM ERROR: unable to legalize instruction: {{.*}} = G_INTRINSIC intrinsic(@llvm.amdgcn.wave.id)
@@ -17,13 +19,21 @@ define amdgpu_cs void @test_wave_id(ptr addrspace(1) %out) {
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
-; GFX12-LABEL: test_wave_id:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_bfe_u32 s0, ttmp8, 0x50019
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-NEXT: global_store_b32 v[0:1], v2, off
-; GFX12-NEXT: s_endpgm
+; GFX1200-LABEL: test_wave_id:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_bfe_u32 s0, ttmp8, 0x50019
+; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT: v_mov_b32_e32 v2, s0
+; GFX1200-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1200-NEXT: s_endpgm
+;
+; GFX1250-LABEL: test_wave_id:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_bfe_u32 s0, ttmp8, 0x50019
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1250-NEXT: s_endpgm
%waveid = call i32 @llvm.amdgcn.wave.id()
store i32 %waveid, ptr addrspace(1) %out
ret void
@@ -39,6 +49,28 @@ define amdgpu_gfx void @test_wave_id_callable(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
+; GFX1200-LABEL: test_wave_id_callable:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1200-NEXT: s_wait_expcnt 0x0
+; GFX1200-NEXT: s_wait_samplecnt 0x0
+; GFX1200-NEXT: s_wait_bvhcnt 0x0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_bfe_u32 s0, ttmp8, 0x50019
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: v_mov_b32_e32 v2, s0
+; GFX1200-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1200-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_wave_id_callable:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_u32 s0, ttmp8, 0x50019
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
; GFX12-LABEL: test_wave_id_callable:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll
index 1015b75..0948530 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll
@@ -6,6 +6,23 @@
declare bfloat @llvm.cos.bf16(bfloat) #0
+define amdgpu_kernel void @cos_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
+; GCN-LABEL: cos_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0x3e230000
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_fma_mixlo_bf16 v0, s2, s3, 0 op_sel_hi:[1,0,0]
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_cos_bf16_e32 v0, v0
+; GCN-NEXT: global_store_b16 v1, v0, s[0:1]
+; GCN-NEXT: s_endpgm
+ %cos = call bfloat @llvm.cos.bf16(bfloat %src) #0
+ store bfloat %cos, ptr addrspace(1) %out, align 2
+ ret void
+}
+
define amdgpu_kernel void @cos_bf16_constant_4(ptr addrspace(1) %out) #1 {
; GCN-LABEL: cos_bf16_constant_4:
; GCN: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 7151fee..af79c91 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -3227,72 +3227,6 @@ define float @v_exp_f32_fast(float %in) {
ret float %result
}
-define float @v_exp_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
-; GCN-SDAG-LABEL: v_exp_f32_approx_fn_attr:
-; GCN-SDAG: ; %bb.0:
-; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2aeac50
-; GCN-SDAG-NEXT: v_add_f32_e32 v1, 0x42800000, v0
-; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; GCN-SDAG-NEXT: v_mul_f32_e32 v1, 0x114b4ea4, v0
-; GCN-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN-GISEL-LABEL: v_exp_f32_approx_fn_attr:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2aeac50
-; GCN-GISEL-NEXT: v_add_f32_e32 v2, 0x42800000, v0
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mul_f32_e32 v1, 0x114b4ea4, v0
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-SDAG-LABEL: v_exp_f32_approx_fn_attr:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2aeac50
-; SI-SDAG-NEXT: v_add_f32_e32 v1, 0x42800000, v0
-; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x114b4ea4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_exp_f32_approx_fn_attr:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2aeac50
-; SI-GISEL-NEXT: v_add_f32_e32 v2, 0x42800000, v0
-; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x114b4ea4, v0
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; R600-LABEL: v_exp_f32_approx_fn_attr:
-; R600: ; %bb.0:
-; R600-NEXT: CF_END
-; R600-NEXT: PAD
-;
-; CM-LABEL: v_exp_f32_approx_fn_attr:
-; CM: ; %bb.0:
-; CM-NEXT: CF_END
-; CM-NEXT: PAD
- %result = call float @llvm.exp.f32(float %in)
- ret float %result
-}
-
define float @v_exp_f32_ninf(float %in) {
; VI-SDAG-LABEL: v_exp_f32_ninf:
; VI-SDAG: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 918b1b2..a99c199 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -3235,78 +3235,6 @@ define float @v_exp10_f32_fast(float %in) {
ret float %result
}
-define float @v_exp10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
-; GCN-SDAG-LABEL: v_exp10_f32_approx_fn_attr:
-; GCN-SDAG: ; %bb.0:
-; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc217b818
-; GCN-SDAG-NEXT: v_add_f32_e32 v1, 0x42000000, v0
-; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-SDAG-NEXT: v_mul_f32_e32 v1, 0x3a2784bc, v0
-; GCN-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0
-; GCN-SDAG-NEXT: v_exp_f32_e32 v1, v1
-; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-SDAG-NEXT: v_mul_f32_e32 v1, 0xa4fb11f, v0
-; GCN-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN-GISEL-LABEL: v_exp10_f32_approx_fn_attr:
-; GCN-GISEL: ; %bb.0:
-; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2aeac50
-; GCN-GISEL-NEXT: v_add_f32_e32 v2, 0x42800000, v0
-; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GCN-GISEL-NEXT: v_mul_f32_e32 v1, 0x114b4ea4, v0
-; GCN-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-SDAG-LABEL: v_exp10_f32_approx_fn_attr:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0xc217b818
-; SI-SDAG-NEXT: v_add_f32_e32 v1, 0x42000000, v0
-; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3a2784bc, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1
-; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
-; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0xa4fb11f, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_exp10_f32_approx_fn_attr:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2aeac50
-; SI-GISEL-NEXT: v_add_f32_e32 v2, 0x42800000, v0
-; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x114b4ea4, v0
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; R600-LABEL: v_exp10_f32_approx_fn_attr:
-; R600: ; %bb.0:
-; R600-NEXT: CF_END
-; R600-NEXT: PAD
-;
-; CM-LABEL: v_exp10_f32_approx_fn_attr:
-; CM: ; %bb.0:
-; CM-NEXT: CF_END
-; CM-NEXT: PAD
- %result = call float @llvm.exp10.f32(float %in)
- ret float %result
-}
-
define float @v_exp10_f32_ninf(float %in) {
; VI-SDAG-LABEL: v_exp10_f32_ninf:
; VI-SDAG: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index e71ea50..883db20 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -1583,104 +1583,6 @@ define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
ret float %result
}
-define float @v_exp2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
-; SI-SDAG-LABEL: v_exp2_f32_approx_fn_attr:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000
-; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; SI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2
-; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_not_b32_e32 v1, 63
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_exp2_f32_approx_fn_attr:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; SI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_not_b32_e32 v1, 63
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-SDAG-LABEL: v_exp2_f32_approx_fn_attr:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000
-; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v2
-; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_not_b32_e32 v1, 63
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-GISEL-LABEL: v_exp2_f32_approx_fn_attr:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
-; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; VI-GISEL-NEXT: v_not_b32_e32 v1, 63
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-SDAG-LABEL: v_exp2_f32_approx_fn_attr:
-; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: s_mov_b32 s4, 0xc2fc0000
-; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_not_b32_e32 v1, 63
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_exp2_f32_approx_fn_attr:
-; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2fc0000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; R600-LABEL: v_exp2_f32_approx_fn_attr:
-; R600: ; %bb.0:
-; R600-NEXT: CF_END
-; R600-NEXT: PAD
-;
-; CM-LABEL: v_exp2_f32_approx_fn_attr:
-; CM: ; %bb.0:
-; CM-NEXT: CF_END
-; CM-NEXT: PAD
- %result = call float @llvm.exp2.f32(float %in)
- ret float %result
-}
-
define float @v_exp2_f32_ninf(float %in) {
; SI-SDAG-LABEL: v_exp2_f32_ninf:
; SI-SDAG: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 307fa89..b5038c8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -3076,121 +3076,6 @@ define float @v_log_f32_fast(float %in) {
ret float %result
}
-define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
-; SI-SDAG-LABEL: v_log_f32_approx_fn_attr:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
-; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
-; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317218
-; SI-SDAG-NEXT: v_fma_f32 v0, v0, s4, v1
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_log_f32_approx_fn_attr:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_log_f32_e32 v2, v0
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0xc1b17218
-; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317218
-; SI-GISEL-NEXT: v_fma_f32 v0, v2, v1, v0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-SDAG-LABEL: v_log_f32_approx_fn_attr:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
-; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
-; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-GISEL-LABEL: v_log_f32_approx_fn_attr:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_log_f32_e32 v2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0xc1b17218
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v2
-; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-SDAG-LABEL: v_log_f32_approx_fn_attr:
-; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
-; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
-; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317218
-; GFX900-SDAG-NEXT: v_fma_f32 v0, v0, s4, v1
-; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_log_f32_approx_fn_attr:
-; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v0
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0xc1b17218
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317218
-; GFX900-GISEL-NEXT: v_fma_f32 v0, v2, v1, v0
-; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-SDAG-LABEL: v_log_f32_approx_fn_attr:
-; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3f317218, v1
-; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log_f32_approx_fn_attr:
-; GFX1100-GISEL: ; %bb.0:
-; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v0
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 0xc1b17218, vcc_lo
-; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v0, 0x3f317218, v1
-; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; R600-LABEL: v_log_f32_approx_fn_attr:
-; R600: ; %bb.0:
-; R600-NEXT: CF_END
-; R600-NEXT: PAD
-;
-; CM-LABEL: v_log_f32_approx_fn_attr:
-; CM: ; %bb.0:
-; CM-NEXT: CF_END
-; CM-NEXT: PAD
- %result = call float @llvm.log.f32(float %in)
- ret float %result
-}
-
define float @v_log_f32_ninf(float %in) {
; SI-SDAG-LABEL: v_log_f32_ninf:
; SI-SDAG: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 5278589..7465b49 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -3076,121 +3076,6 @@ define float @v_log10_f32_fast(float %in) {
ret float %result
}
-define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
-; SI-SDAG-LABEL: v_log10_f32_approx_fn_attr:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
-; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
-; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209b
-; SI-SDAG-NEXT: v_fma_f32 v0, v0, s4, v1
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_log10_f32_approx_fn_attr:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_log_f32_e32 v2, v0
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0xc11a209b
-; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209b
-; SI-GISEL-NEXT: v_fma_f32 v0, v2, v1, v0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-SDAG-LABEL: v_log10_f32_approx_fn_attr:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
-; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
-; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
-; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-GISEL-LABEL: v_log10_f32_approx_fn_attr:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_log_f32_e32 v2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0xc11a209b
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v2
-; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-SDAG-LABEL: v_log10_f32_approx_fn_attr:
-; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
-; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
-; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209b
-; GFX900-SDAG-NEXT: v_fma_f32 v0, v0, s4, v1
-; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_log10_f32_approx_fn_attr:
-; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v0
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0xc11a209b
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209b
-; GFX900-GISEL-NEXT: v_fma_f32 v0, v2, v1, v0
-; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-SDAG-LABEL: v_log10_f32_approx_fn_attr:
-; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3e9a209b, v1
-; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log10_f32_approx_fn_attr:
-; GFX1100-GISEL: ; %bb.0:
-; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v0
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 0xc11a209b, vcc_lo
-; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v0, 0x3e9a209b, v1
-; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; R600-LABEL: v_log10_f32_approx_fn_attr:
-; R600: ; %bb.0:
-; R600-NEXT: CF_END
-; R600-NEXT: PAD
-;
-; CM-LABEL: v_log10_f32_approx_fn_attr:
-; CM: ; %bb.0:
-; CM-NEXT: CF_END
-; CM-NEXT: PAD
- %result = call float @llvm.log10.f32(float %in)
- ret float %result
-}
-
define float @v_log10_f32_ninf(float %in) {
; SI-SDAG-LABEL: v_log10_f32_ninf:
; SI-SDAG: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 4ca612a..0854134 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -2030,129 +2030,6 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
ret float %result
}

-define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
-; SI-SDAG-LABEL: v_log2_f32_approx_fn_attr:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000
-; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
-; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2
-; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
-; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; SI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_log2_f32_approx_fn_attr:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; SI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-SDAG-LABEL: v_log2_f32_approx_fn_attr:
-; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000
-; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
-; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; VI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
-; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
-; VI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-GISEL-LABEL: v_log2_f32_approx_fn_attr:
-; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; VI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
-; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-SDAG-LABEL: v_log2_f32_approx_fn_attr:
-; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000
-; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
-; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_log2_f32_approx_fn_attr:
-; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX900-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-SDAG-LABEL: v_log2_f32_approx_fn_attr:
-; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log2_f32_approx_fn_attr:
-; GFX1100-GISEL: ; %bb.0:
-; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
-; GFX1100-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
-; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; R600-LABEL: v_log2_f32_approx_fn_attr:
-; R600: ; %bb.0:
-; R600-NEXT: CF_END
-; R600-NEXT: PAD
-;
-; CM-LABEL: v_log2_f32_approx_fn_attr:
-; CM: ; %bb.0:
-; CM-NEXT: CF_END
-; CM-NEXT: PAD
- %result = call float @llvm.log2.f32(float %in)
- ret float %result
-}
-
define float @v_log2_f32_ninf(float %in) {
; SI-SDAG-LABEL: v_log2_f32_ninf:
; SI-SDAG: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index af914bd..355f77a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -76,12 +76,13 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_movk_i32 s4, 0xfc01
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s3, 0xfffff
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfe_u32 v4, v3, 20, 11
-; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4
+; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4
; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6
; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3
; SI-NEXT: v_not_b32_e32 v5, v5
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll
index 701f54b..fe8ace5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll
@@ -6,6 +6,23 @@

 declare bfloat @llvm.sin.bf16(bfloat) #0

+define amdgpu_kernel void @sin_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
+; GCN-LABEL: sin_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GCN-NEXT: s_mov_b32 s3, 0x3e230000
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_fma_mixlo_bf16 v0, s2, s3, 0 op_sel_hi:[1,0,0]
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_sin_bf16_e32 v0, v0
+; GCN-NEXT: global_store_b16 v1, v0, s[0:1]
+; GCN-NEXT: s_endpgm
+ %sin = call bfloat @llvm.sin.bf16(bfloat %src) #0
+ store bfloat %sin, ptr addrspace(1) %out, align 2
+ ret void
+}
+
define amdgpu_kernel void @sin_bf16_constant_4(ptr addrspace(1) %out) #1 {
; GCN-LABEL: sin_bf16_constant_4:
; GCN: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.ll
index 2366e39..66b01a5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.ll
@@ -16,7 +16,7 @@
; GFX9-NOT: v_fract_f32
; GCN: v_sin_f32
; GCN-NOT: v_sin_f32
-define amdgpu_kernel void @sin_f32(ptr addrspace(1) %out, float %x) #1 {
+define amdgpu_kernel void @sin_f32(ptr addrspace(1) %out, float %x) {
%sin = call float @llvm.sin.f32(float %x)
store float %sin, ptr addrspace(1) %out
ret void
@@ -29,7 +29,7 @@ define amdgpu_kernel void @sin_f32(ptr addrspace(1) %out, float %x) #1 {
; GFX9-NOT: v_fract_f32
; GCN: v_sin_f32
; GCN-NOT: v_sin_f32
-define amdgpu_kernel void @safe_sin_3x_f32(ptr addrspace(1) %out, float %x) #1 {
+define amdgpu_kernel void @safe_sin_3x_f32(ptr addrspace(1) %out, float %x) {
%y = fmul float 3.0, %x
%sin = call float @llvm.sin.f32(float %y)
store float %sin, ptr addrspace(1) %out
@@ -44,9 +44,9 @@ define amdgpu_kernel void @safe_sin_3x_f32(ptr addrspace(1) %out, float %x) #1 {
; GFX9-NOT: v_fract_f32
; GCN: v_sin_f32
; GCN-NOT: v_sin_f32
-define amdgpu_kernel void @unsafe_sin_3x_f32(ptr addrspace(1) %out, float %x) #2 {
- %y = fmul float 3.0, %x
- %sin = call float @llvm.sin.f32(float %y)
+define amdgpu_kernel void @unsafe_sin_3x_f32(ptr addrspace(1) %out, float %x) {
+ %y = fmul reassoc float 3.0, %x
+ %sin = call reassoc float @llvm.sin.f32(float %y)
store float %sin, ptr addrspace(1) %out
ret void
}
@@ -59,7 +59,7 @@ define amdgpu_kernel void @unsafe_sin_3x_f32(ptr addrspace(1) %out, float %x) #2
; GFX9-NOT: v_fract_f32
; GCN: v_sin_f32
; GCN-NOT: v_sin_f32
-define amdgpu_kernel void @fmf_sin_3x_f32(ptr addrspace(1) %out, float %x) #1 {
+define amdgpu_kernel void @fmf_sin_3x_f32(ptr addrspace(1) %out, float %x) {
%y = fmul reassoc float 3.0, %x
%sin = call reassoc float @llvm.sin.f32(float %y)
store float %sin, ptr addrspace(1) %out
@@ -73,7 +73,7 @@ define amdgpu_kernel void @fmf_sin_3x_f32(ptr addrspace(1) %out, float %x) #1 {
; GFX9-NOT: v_fract_f32
; GCN: v_sin_f32
; GCN-NOT: v_sin_f32
-define amdgpu_kernel void @safe_sin_2x_f32(ptr addrspace(1) %out, float %x) #1 {
+define amdgpu_kernel void @safe_sin_2x_f32(ptr addrspace(1) %out, float %x) {
%y = fmul float 2.0, %x
%sin = call float @llvm.sin.f32(float %y)
store float %sin, ptr addrspace(1) %out
@@ -88,9 +88,9 @@ define amdgpu_kernel void @safe_sin_2x_f32(ptr addrspace(1) %out, float %x) #1 {
; GFX9-NOT: v_fract_f32
; GCN: v_sin_f32
; GCN-NOT: v_sin_f32
-define amdgpu_kernel void @unsafe_sin_2x_f32(ptr addrspace(1) %out, float %x) #2 {
- %y = fmul float 2.0, %x
- %sin = call float @llvm.sin.f32(float %y)
+define amdgpu_kernel void @unsafe_sin_2x_f32(ptr addrspace(1) %out, float %x) {
+ %y = fmul reassoc float 2.0, %x
+ %sin = call reassoc float @llvm.sin.f32(float %y)
store float %sin, ptr addrspace(1) %out
ret void
}
@@ -103,7 +103,7 @@ define amdgpu_kernel void @unsafe_sin_2x_f32(ptr addrspace(1) %out, float %x) #2
; GFX9-NOT: v_fract_f32
; GCN: v_sin_f32
; GCN-NOT: v_sin_f32
-define amdgpu_kernel void @fmf_sin_2x_f32(ptr addrspace(1) %out, float %x) #1 {
+define amdgpu_kernel void @fmf_sin_2x_f32(ptr addrspace(1) %out, float %x) {
%y = fmul reassoc float 2.0, %x
%sin = call reassoc float @llvm.sin.f32(float %y)
store float %sin, ptr addrspace(1) %out
@@ -117,7 +117,7 @@ define amdgpu_kernel void @fmf_sin_2x_f32(ptr addrspace(1) %out, float %x) #1 {
; GFX9-NOT: v_fract_f32
; GCN: v_sin_f32
; GCN-NOT: v_sin_f32
-define amdgpu_kernel void @safe_sin_cancel_f32(ptr addrspace(1) %out, float %x) #1 {
+define amdgpu_kernel void @safe_sin_cancel_f32(ptr addrspace(1) %out, float %x) {
%y = fmul float 0x401921FB60000000, %x
%sin = call float @llvm.sin.f32(float %y)
store float %sin, ptr addrspace(1) %out
@@ -131,9 +131,9 @@ define amdgpu_kernel void @safe_sin_cancel_f32(ptr addrspace(1) %out, float %x)
; GFX9-NOT: v_fract_f32
; GCN: v_sin_f32
; GCN-NOT: v_sin_f32
-define amdgpu_kernel void @unsafe_sin_cancel_f32(ptr addrspace(1) %out, float %x) #2 {
- %y = fmul float 0x401921FB60000000, %x
- %sin = call float @llvm.sin.f32(float %y)
+define amdgpu_kernel void @unsafe_sin_cancel_f32(ptr addrspace(1) %out, float %x) {
+ %y = fmul reassoc float 0x401921FB60000000, %x
+ %sin = call reassoc float @llvm.sin.f32(float %y)
store float %sin, ptr addrspace(1) %out
ret void
}
@@ -145,7 +145,7 @@ define amdgpu_kernel void @unsafe_sin_cancel_f32(ptr addrspace(1) %out, float %x
; GFX9-NOT: v_fract_f32
; GCN: v_sin_f32
; GCN-NOT: v_sin_f32
-define amdgpu_kernel void @fmf_sin_cancel_f32(ptr addrspace(1) %out, float %x) #1 {
+define amdgpu_kernel void @fmf_sin_cancel_f32(ptr addrspace(1) %out, float %x) {
%y = fmul reassoc float 0x401921FB60000000, %x
%sin = call reassoc float @llvm.sin.f32(float %y)
store float %sin, ptr addrspace(1) %out
@@ -164,7 +164,7 @@ define amdgpu_kernel void @fmf_sin_cancel_f32(ptr addrspace(1) %out, float %x) #
; GCN: v_sin_f32
; GCN: v_sin_f32
; GCN-NOT: v_sin_f32
-define amdgpu_kernel void @sin_v4f32(ptr addrspace(1) %out, <4 x float> %vx) #1 {
+define amdgpu_kernel void @sin_v4f32(ptr addrspace(1) %out, <4 x float> %vx) {
%sin = call <4 x float> @llvm.sin.v4f32( <4 x float> %vx)
store <4 x float> %sin, ptr addrspace(1) %out
ret void
@@ -174,5 +174,3 @@ declare float @llvm.sin.f32(float) #0
declare <4 x float> @llvm.sin.v4f32(<4 x float>) #0

attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind "unsafe-fp-math"="false" }
-attributes #2 = { nounwind "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll b/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll
index 91a8446..13ea8b0 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll
@@ -18,10 +18,9 @@ define amdgpu_cs void @test_uniform_load_b96(ptr addrspace(1) %ptr, i32 %arg) "a
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_or_b32 s1, s2, s3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s0, s0, s1
-; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_or3_b32 v2, s2, v2, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
;
@@ -34,14 +33,12 @@ define amdgpu_cs void @test_uniform_load_b96(ptr addrspace(1) %ptr, i32 %arg) "a
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
; GFX12-NEXT: v_readfirstlane_b32 s0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_readfirstlane_b32 s1, v3
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_or_b32 s0, s0, s1
-; GFX12-NEXT: s_or_b32 s0, s2, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: v_or3_b32 v2, v2, s1, s2
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index d59f72a..5b22135 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -3,6 +3,7 @@
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s

define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
; GFX6-LABEL: constant_load_i1:
@@ -74,6 +75,18 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_and_b32 s2, s2, 1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load i1, ptr addrspace(4) %in
store i1 %load, ptr addrspace(1) %out
ret void
@@ -145,6 +158,16 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v2i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <2 x i1>, ptr addrspace(4) %in
store <2 x i1> %load, ptr addrspace(1) %out
ret void
@@ -215,6 +238,16 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v3i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <3 x i1>, ptr addrspace(4) %in
store <3 x i1> %load, ptr addrspace(1) %out
ret void
@@ -286,6 +319,16 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v4i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <4 x i1>, ptr addrspace(4) %in
store <4 x i1> %load, ptr addrspace(1) %out
ret void
@@ -357,6 +400,16 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v8i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <8 x i1>, ptr addrspace(4) %in
store <8 x i1> %load, ptr addrspace(1) %out
ret void
@@ -428,6 +481,16 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v16i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <16 x i1>, ptr addrspace(4) %in
store <16 x i1> %load, ptr addrspace(1) %out
ret void
@@ -483,6 +546,16 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v32i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <32 x i1>, ptr addrspace(4) %in
store <32 x i1> %load, ptr addrspace(1) %out
ret void
@@ -541,6 +614,17 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v64i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <64 x i1>, ptr addrspace(4) %in
store <64 x i1> %load, ptr addrspace(1) %out
ret void
@@ -602,6 +686,16 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_i1_to_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%a = load i1, ptr addrspace(4) %in
%ext = zext i1 %a to i32
store i32 %ext, ptr addrspace(1) %out
@@ -669,6 +763,18 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_i1_to_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%a = load i1, ptr addrspace(4) %in
%ext = sext i1 %a to i32
store i32 %ext, ptr addrspace(1) %out
@@ -731,6 +837,16 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v1i1_to_v1i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <1 x i1>, ptr addrspace(4) %in
%ext = zext <1 x i1> %load to <1 x i32>
store <1 x i32> %ext, ptr addrspace(1) %out
@@ -798,6 +914,18 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v1i1_to_v1i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <1 x i1>, ptr addrspace(4) %in
%ext = sext <1 x i1> %load to <1 x i32>
store <1 x i32> %ext, ptr addrspace(1) %out
@@ -871,6 +999,19 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out
; GFX12-NEXT: v_lshrrev_b32_e32 v1, 1, v1
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v2i1_to_v2i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v0, v2, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_lshrrev_b32 v1, 1, v1 :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <2 x i1>, ptr addrspace(4) %in
%ext = zext <2 x i1> %load to <2 x i32>
store <2 x i32> %ext, ptr addrspace(1) %out
@@ -944,6 +1085,19 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v2i1_to_v2i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_i32 s3, s2, 0x10000
+; GFX1250-NEXT: s_bfe_i32 s2, s2, 0x10001
+; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <2 x i1>, ptr addrspace(4) %in
%ext = sext <2 x i1> %load to <2 x i32>
store <2 x i32> %ext, ptr addrspace(1) %out
@@ -1027,6 +1181,21 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v3i1_to_v3i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v3, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v1, v3, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_dual_lshrrev_b32 v2, 2, v0 :: v_dual_bitop2_b32 v0, 1, v1 bitop3:0x40
+; GFX1250-NEXT: v_bfe_u32 v1, v1, 1, 1
+; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <3 x i1>, ptr addrspace(4) %in
%ext = zext <3 x i1> %load to <3 x i32>
store <3 x i32> %ext, ptr addrspace(1) %out
@@ -1109,6 +1278,20 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out
; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v3i1_to_v3i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_i32 s3, s2, 0x10002
+; GFX1250-NEXT: s_bfe_i32 s4, s2, 0x10000
+; GFX1250-NEXT: s_bfe_i32 s2, s2, 0x10001
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
+; GFX1250-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <3 x i1>, ptr addrspace(4) %in
%ext = sext <3 x i1> %load to <3 x i32>
store <3 x i32> %ext, ptr addrspace(1) %out
@@ -1192,6 +1375,22 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 3, v3
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v4i1_to_v4i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v4, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v1, v4, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff, v1
+; GFX1250-NEXT: v_and_b32_e32 v0, 1, v1
+; GFX1250-NEXT: v_bfe_u32 v2, v1, 2, 1
+; GFX1250-NEXT: v_bfe_u32 v1, v1, 1, 1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1250-NEXT: v_lshrrev_b32_e32 v3, 3, v3
+; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <4 x i1>, ptr addrspace(4) %in
%ext = zext <4 x i1> %load to <4 x i32>
store <4 x i32> %ext, ptr addrspace(1) %out
@@ -1278,6 +1477,22 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v4i1_to_v4i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_i32 s3, s2, 0x10003
+; GFX1250-NEXT: s_bfe_i32 s4, s2, 0x10002
+; GFX1250-NEXT: s_bfe_i32 s5, s2, 0x10000
+; GFX1250-NEXT: s_bfe_i32 s2, s2, 0x10001
+; GFX1250-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v0, s5
+; GFX1250-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: v_mov_b32_e32 v3, s3
+; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <4 x i1>, ptr addrspace(4) %in
%ext = sext <4 x i1> %load to <4 x i32>
store <4 x i32> %ext, ptr addrspace(1) %out
@@ -1403,6 +1618,31 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v8i1_to_v8i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v8, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v0, v8, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10003
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10001
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10005
+; GFX1250-NEXT: s_and_b32 s6, s2, 1
+; GFX1250-NEXT: s_bfe_u32 s7, s2, 0x10002
+; GFX1250-NEXT: s_bfe_u32 s2, s2, 0x10004
+; GFX1250-NEXT: v_lshrrev_b32_e32 v3, 7, v0
+; GFX1250-NEXT: v_bfe_u32 v2, v0, 6, 1
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s4
+; GFX1250-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s3
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <8 x i1>, ptr addrspace(4) %in
%ext = zext <8 x i1> %load to <8 x i32>
store <8 x i32> %ext, ptr addrspace(1) %out
@@ -1524,6 +1764,30 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v8i1_to_v8i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_i32 s3, s2, 0x10003
+; GFX1250-NEXT: s_bfe_i32 s4, s2, 0x10002
+; GFX1250-NEXT: s_bfe_i32 s5, s2, 0x10001
+; GFX1250-NEXT: s_bfe_i32 s6, s2, 0x10000
+; GFX1250-NEXT: s_bfe_i32 s7, s2, 0x10007
+; GFX1250-NEXT: s_bfe_i32 s8, s2, 0x10006
+; GFX1250-NEXT: s_bfe_i32 s9, s2, 0x10004
+; GFX1250-NEXT: s_bfe_i32 s2, s2, 0x10005
+; GFX1250-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v0, s9
+; GFX1250-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s8
+; GFX1250-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_mov_b32 v4, s6
+; GFX1250-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v6, s4
+; GFX1250-NEXT: v_mov_b32_e32 v7, s3
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <8 x i1>, ptr addrspace(4) %in
%ext = sext <8 x i1> %load to <8 x i32>
store <8 x i32> %ext, ptr addrspace(1) %out
@@ -1722,6 +1986,46 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v16i1_to_v16i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v16, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u16 v0, v16, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1250-NEXT: s_and_b32 s6, 0xffff, s2
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10003
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10001
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10007
+; GFX1250-NEXT: s_bfe_u32 s7, s2, 0x10009
+; GFX1250-NEXT: s_bfe_u32 s8, s2, 0x1000d
+; GFX1250-NEXT: s_and_b32 s9, s2, 1
+; GFX1250-NEXT: s_bfe_u32 s10, s2, 0x1000a
+; GFX1250-NEXT: s_bfe_u32 s2, s2, 0x1000c
+; GFX1250-NEXT: s_bfe_u32 s11, s6, 0x10005
+; GFX1250-NEXT: s_bfe_u32 s12, s6, 0x1000b
+; GFX1250-NEXT: s_lshr_b32 s13, s6, 15
+; GFX1250-NEXT: s_bfe_u32 s14, s6, 0x10002
+; GFX1250-NEXT: s_bfe_u32 s15, s6, 0x10006
+; GFX1250-NEXT: s_bfe_u32 s16, s6, 0x10004
+; GFX1250-NEXT: s_bfe_u32 s17, s6, 0x10008
+; GFX1250-NEXT: s_bfe_u32 s6, s6, 0x1000e
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s8
+; GFX1250-NEXT: v_dual_mov_b32 v5, s7 :: v_dual_mov_b32 v15, s3
+; GFX1250-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s13
+; GFX1250-NEXT: v_dual_mov_b32 v4, s17 :: v_dual_mov_b32 v6, s10
+; GFX1250-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v7, s12
+; GFX1250-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s11
+; GFX1250-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v12, s9
+; GFX1250-NEXT: v_dual_mov_b32 v13, s4 :: v_dual_mov_b32 v14, s14
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <16 x i1>, ptr addrspace(4) %in
%ext = zext <16 x i1> %load to <16 x i32>
store <16 x i32> %ext, ptr addrspace(1) %out
@@ -1915,6 +2219,44 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v16i1_to_v16i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u16 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_i32 s3, s2, 0x10003
+; GFX1250-NEXT: s_bfe_i32 s4, s2, 0x10002
+; GFX1250-NEXT: s_bfe_i32 s5, s2, 0x10001
+; GFX1250-NEXT: s_bfe_i32 s6, s2, 0x10000
+; GFX1250-NEXT: s_bfe_i32 s7, s2, 0x10007
+; GFX1250-NEXT: s_bfe_i32 s8, s2, 0x10006
+; GFX1250-NEXT: s_bfe_i32 s9, s2, 0x10005
+; GFX1250-NEXT: s_bfe_i32 s10, s2, 0x10004
+; GFX1250-NEXT: s_bfe_i32 s11, s2, 0x1000b
+; GFX1250-NEXT: s_bfe_i32 s12, s2, 0x1000a
+; GFX1250-NEXT: s_bfe_i32 s13, s2, 0x10009
+; GFX1250-NEXT: s_bfe_i32 s14, s2, 0x10008
+; GFX1250-NEXT: s_bfe_i32 s15, s2, 0x1000f
+; GFX1250-NEXT: s_bfe_i32 s16, s2, 0x1000e
+; GFX1250-NEXT: s_bfe_i32 s17, s2, 0x1000c
+; GFX1250-NEXT: s_bfe_i32 s2, s2, 0x1000d
+; GFX1250-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v0, s17
+; GFX1250-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s16
+; GFX1250-NEXT: v_dual_mov_b32 v3, s15 :: v_dual_mov_b32 v4, s14
+; GFX1250-NEXT: v_dual_mov_b32 v5, s13 :: v_dual_mov_b32 v6, s12
+; GFX1250-NEXT: v_dual_mov_b32 v7, s11 :: v_dual_mov_b32 v8, s10
+; GFX1250-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v10, s8
+; GFX1250-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v12, s6
+; GFX1250-NEXT: v_dual_mov_b32 v13, s5 :: v_dual_mov_b32 v14, s4
+; GFX1250-NEXT: v_mov_b32_e32 v15, s3
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <16 x i1>, ptr addrspace(4) %in
%ext = sext <16 x i1> %load to <16 x i32>
store <16 x i32> %ext, ptr addrspace(1) %out
@@ -2284,6 +2626,75 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v32i1_to_v32i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10003
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10001
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10007
+; GFX1250-NEXT: s_bfe_u32 s6, s2, 0x10005
+; GFX1250-NEXT: s_bfe_u32 s7, s2, 0x1000b
+; GFX1250-NEXT: s_bfe_u32 s8, s2, 0x10009
+; GFX1250-NEXT: s_bfe_u32 s9, s2, 0x1000f
+; GFX1250-NEXT: s_bfe_u32 s10, s2, 0x1000d
+; GFX1250-NEXT: s_bfe_u32 s11, s2, 0x10013
+; GFX1250-NEXT: s_bfe_u32 s12, s2, 0x10011
+; GFX1250-NEXT: s_bfe_u32 s13, s2, 0x10017
+; GFX1250-NEXT: s_bfe_u32 s14, s2, 0x1001b
+; GFX1250-NEXT: s_bfe_u32 s15, s2, 0x10019
+; GFX1250-NEXT: s_lshr_b32 s16, s2, 31
+; GFX1250-NEXT: s_bfe_u32 s17, s2, 0x1001d
+; GFX1250-NEXT: s_and_b32 s18, s2, 1
+; GFX1250-NEXT: s_bfe_u32 s19, s2, 0x10002
+; GFX1250-NEXT: s_bfe_u32 s20, s2, 0x10006
+; GFX1250-NEXT: s_bfe_u32 s21, s2, 0x10004
+; GFX1250-NEXT: s_bfe_u32 s22, s2, 0x1000a
+; GFX1250-NEXT: s_bfe_u32 s23, s2, 0x10008
+; GFX1250-NEXT: s_bfe_u32 s24, s2, 0x1000e
+; GFX1250-NEXT: s_bfe_u32 s25, s2, 0x1000c
+; GFX1250-NEXT: s_bfe_u32 s26, s2, 0x10012
+; GFX1250-NEXT: s_bfe_u32 s27, s2, 0x10010
+; GFX1250-NEXT: s_bfe_u32 s28, s2, 0x10016
+; GFX1250-NEXT: s_bfe_u32 s29, s2, 0x10015
+; GFX1250-NEXT: s_bfe_u32 s30, s2, 0x10014
+; GFX1250-NEXT: s_bfe_u32 s31, s2, 0x1001a
+; GFX1250-NEXT: s_bfe_u32 s33, s2, 0x10018
+; GFX1250-NEXT: s_bfe_u32 s34, s2, 0x1001c
+; GFX1250-NEXT: s_bfe_u32 s2, s2, 0x1001e
+; GFX1250-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v0, s34
+; GFX1250-NEXT: v_dual_mov_b32 v1, s17 :: v_dual_mov_b32 v2, s2
+; GFX1250-NEXT: v_dual_mov_b32 v3, s16 :: v_dual_mov_b32 v4, s33
+; GFX1250-NEXT: v_dual_mov_b32 v5, s15 :: v_dual_mov_b32 v6, s31
+; GFX1250-NEXT: v_dual_mov_b32 v7, s14 :: v_dual_mov_b32 v8, s30
+; GFX1250-NEXT: v_dual_mov_b32 v9, s29 :: v_dual_mov_b32 v10, s28
+; GFX1250-NEXT: v_mov_b32_e32 v11, s13
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:112
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v0, s27 :: v_dual_mov_b32 v1, s12
+; GFX1250-NEXT: v_dual_mov_b32 v2, s26 :: v_dual_mov_b32 v3, s11
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v4, s25 :: v_dual_mov_b32 v5, s10
+; GFX1250-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s9
+; GFX1250-NEXT: v_dual_mov_b32 v12, s23 :: v_dual_mov_b32 v13, s8
+; GFX1250-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v15, s7
+; GFX1250-NEXT: v_dual_mov_b32 v16, s21 :: v_dual_mov_b32 v17, s6
+; GFX1250-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v19, s5
+; GFX1250-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s4
+; GFX1250-NEXT: v_dual_mov_b32 v22, s19 :: v_dual_mov_b32 v23, s3
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:80
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:64
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <32 x i1>, ptr addrspace(4) %in
%ext = zext <32 x i1> %load to <32 x i32>
store <32 x i32> %ext, ptr addrspace(1) %out
@@ -2686,6 +3097,75 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v32i1_to_v32i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_i32 s3, s2, 0x10003
+; GFX1250-NEXT: s_bfe_i32 s4, s2, 0x10002
+; GFX1250-NEXT: s_bfe_i32 s5, s2, 0x10001
+; GFX1250-NEXT: s_bfe_i32 s6, s2, 0x10000
+; GFX1250-NEXT: s_bfe_i32 s7, s2, 0x10007
+; GFX1250-NEXT: s_bfe_i32 s8, s2, 0x10006
+; GFX1250-NEXT: s_bfe_i32 s9, s2, 0x10005
+; GFX1250-NEXT: s_bfe_i32 s10, s2, 0x10004
+; GFX1250-NEXT: s_bfe_i32 s11, s2, 0x1000b
+; GFX1250-NEXT: s_bfe_i32 s12, s2, 0x1000a
+; GFX1250-NEXT: s_bfe_i32 s13, s2, 0x10009
+; GFX1250-NEXT: s_bfe_i32 s14, s2, 0x10008
+; GFX1250-NEXT: s_bfe_i32 s15, s2, 0x1000f
+; GFX1250-NEXT: s_bfe_i32 s16, s2, 0x1000e
+; GFX1250-NEXT: s_bfe_i32 s17, s2, 0x1000d
+; GFX1250-NEXT: s_bfe_i32 s18, s2, 0x1000c
+; GFX1250-NEXT: s_bfe_i32 s19, s2, 0x10013
+; GFX1250-NEXT: s_bfe_i32 s20, s2, 0x10012
+; GFX1250-NEXT: s_bfe_i32 s21, s2, 0x10011
+; GFX1250-NEXT: s_bfe_i32 s22, s2, 0x10010
+; GFX1250-NEXT: s_bfe_i32 s23, s2, 0x10017
+; GFX1250-NEXT: s_bfe_i32 s24, s2, 0x10016
+; GFX1250-NEXT: s_bfe_i32 s25, s2, 0x10015
+; GFX1250-NEXT: s_bfe_i32 s26, s2, 0x10014
+; GFX1250-NEXT: s_bfe_i32 s27, s2, 0x1001b
+; GFX1250-NEXT: s_bfe_i32 s28, s2, 0x1001a
+; GFX1250-NEXT: s_bfe_i32 s29, s2, 0x10019
+; GFX1250-NEXT: s_bfe_i32 s30, s2, 0x10018
+; GFX1250-NEXT: s_ashr_i32 s31, s2, 31
+; GFX1250-NEXT: s_bfe_i32 s33, s2, 0x1001e
+; GFX1250-NEXT: s_bfe_i32 s34, s2, 0x1001c
+; GFX1250-NEXT: s_bfe_i32 s2, s2, 0x1001d
+; GFX1250-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v0, s34
+; GFX1250-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s33
+; GFX1250-NEXT: v_dual_mov_b32 v3, s31 :: v_dual_mov_b32 v4, s30
+; GFX1250-NEXT: v_dual_mov_b32 v5, s29 :: v_dual_mov_b32 v6, s28
+; GFX1250-NEXT: v_dual_mov_b32 v7, s27 :: v_dual_mov_b32 v8, s26
+; GFX1250-NEXT: v_dual_mov_b32 v9, s25 :: v_dual_mov_b32 v10, s24
+; GFX1250-NEXT: v_mov_b32_e32 v11, s23
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:112
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s21
+; GFX1250-NEXT: v_dual_mov_b32 v2, s20 :: v_dual_mov_b32 v3, s19
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s17
+; GFX1250-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v7, s15
+; GFX1250-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v13, s13
+; GFX1250-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v15, s11
+; GFX1250-NEXT: v_dual_mov_b32 v16, s10 :: v_dual_mov_b32 v17, s9
+; GFX1250-NEXT: v_dual_mov_b32 v18, s8 :: v_dual_mov_b32 v19, s7
+; GFX1250-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s5
+; GFX1250-NEXT: v_dual_mov_b32 v22, s4 :: v_dual_mov_b32 v23, s3
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:80
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:64
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <32 x i1>, ptr addrspace(4) %in
%ext = sext <32 x i1> %load to <32 x i32>
store <32 x i32> %ext, ptr addrspace(1) %out
@@ -3387,6 +3867,141 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v64i1_to_v64i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_lshr_b32 s33, s3, 31
+; GFX1250-NEXT: s_bfe_u32 s34, s3, 0x1001d
+; GFX1250-NEXT: s_bfe_u32 s65, s3, 0x1001c
+; GFX1250-NEXT: s_bfe_u32 s66, s3, 0x1001e
+; GFX1250-NEXT: s_bfe_u32 s30, s3, 0x1001b
+; GFX1250-NEXT: s_bfe_u32 s31, s3, 0x10019
+; GFX1250-NEXT: s_bfe_u32 s63, s3, 0x1001a
+; GFX1250-NEXT: s_bfe_u32 s64, s3, 0x10018
+; GFX1250-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v0, s65
+; GFX1250-NEXT: s_bfe_u32 s29, s3, 0x10017
+; GFX1250-NEXT: s_bfe_u32 s60, s3, 0x10016
+; GFX1250-NEXT: s_bfe_u32 s61, s3, 0x10015
+; GFX1250-NEXT: s_bfe_u32 s62, s3, 0x10014
+; GFX1250-NEXT: v_dual_mov_b32 v1, s34 :: v_dual_mov_b32 v2, s66
+; GFX1250-NEXT: v_dual_mov_b32 v3, s33 :: v_dual_mov_b32 v4, s64
+; GFX1250-NEXT: s_bfe_u32 s27, s3, 0x10013
+; GFX1250-NEXT: s_bfe_u32 s28, s3, 0x10011
+; GFX1250-NEXT: s_bfe_u32 s58, s3, 0x10012
+; GFX1250-NEXT: s_bfe_u32 s59, s3, 0x10010
+; GFX1250-NEXT: v_dual_mov_b32 v5, s31 :: v_dual_mov_b32 v6, s63
+; GFX1250-NEXT: v_dual_mov_b32 v7, s30 :: v_dual_mov_b32 v8, s62
+; GFX1250-NEXT: v_dual_mov_b32 v9, s61 :: v_dual_mov_b32 v10, s60
+; GFX1250-NEXT: v_dual_mov_b32 v11, s29 :: v_dual_mov_b32 v12, s59
+; GFX1250-NEXT: s_bfe_u32 s19, s3, 0x10003
+; GFX1250-NEXT: s_bfe_u32 s20, s3, 0x10001
+; GFX1250-NEXT: s_bfe_u32 s21, s3, 0x10007
+; GFX1250-NEXT: s_bfe_u32 s22, s3, 0x10005
+; GFX1250-NEXT: s_bfe_u32 s23, s3, 0x1000b
+; GFX1250-NEXT: s_bfe_u32 s24, s3, 0x10009
+; GFX1250-NEXT: s_bfe_u32 s25, s3, 0x1000f
+; GFX1250-NEXT: s_bfe_u32 s26, s3, 0x1000d
+; GFX1250-NEXT: s_and_b32 s51, s3, 1
+; GFX1250-NEXT: s_bfe_u32 s52, s3, 0x10002
+; GFX1250-NEXT: s_bfe_u32 s53, s3, 0x10006
+; GFX1250-NEXT: s_bfe_u32 s54, s3, 0x10004
+; GFX1250-NEXT: s_bfe_u32 s55, s3, 0x1000a
+; GFX1250-NEXT: s_bfe_u32 s56, s3, 0x10008
+; GFX1250-NEXT: s_bfe_u32 s57, s3, 0x1000e
+; GFX1250-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v14, s58
+; GFX1250-NEXT: s_bfe_u32 s3, s3, 0x1000c
+; GFX1250-NEXT: v_mov_b32_e32 v15, s27
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:240
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:224
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:208
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:192
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s26
+; GFX1250-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s25
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_dual_mov_b32 v4, s56 :: v_dual_mov_b32 v5, s24
+; GFX1250-NEXT: v_dual_mov_b32 v6, s55 :: v_dual_mov_b32 v7, s23
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_mov_b32_e32 v8, s54
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10003
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10001
+; GFX1250-NEXT: s_bfe_u32 s6, s2, 0x10007
+; GFX1250-NEXT: s_bfe_u32 s7, s2, 0x10005
+; GFX1250-NEXT: s_bfe_u32 s8, s2, 0x1000b
+; GFX1250-NEXT: s_bfe_u32 s9, s2, 0x10009
+; GFX1250-NEXT: s_bfe_u32 s10, s2, 0x1000f
+; GFX1250-NEXT: s_bfe_u32 s11, s2, 0x1000d
+; GFX1250-NEXT: s_bfe_u32 s12, s2, 0x10013
+; GFX1250-NEXT: s_bfe_u32 s13, s2, 0x10011
+; GFX1250-NEXT: s_bfe_u32 s14, s2, 0x10017
+; GFX1250-NEXT: s_bfe_u32 s15, s2, 0x1001b
+; GFX1250-NEXT: s_bfe_u32 s16, s2, 0x10019
+; GFX1250-NEXT: s_lshr_b32 s17, s2, 31
+; GFX1250-NEXT: s_bfe_u32 s18, s2, 0x1001d
+; GFX1250-NEXT: s_and_b32 s35, s2, 1
+; GFX1250-NEXT: s_bfe_u32 s36, s2, 0x10002
+; GFX1250-NEXT: s_bfe_u32 s37, s2, 0x10006
+; GFX1250-NEXT: s_bfe_u32 s38, s2, 0x10004
+; GFX1250-NEXT: s_bfe_u32 s39, s2, 0x1000a
+; GFX1250-NEXT: s_bfe_u32 s40, s2, 0x10008
+; GFX1250-NEXT: s_bfe_u32 s41, s2, 0x1000e
+; GFX1250-NEXT: s_bfe_u32 s42, s2, 0x1000c
+; GFX1250-NEXT: s_bfe_u32 s43, s2, 0x10012
+; GFX1250-NEXT: s_bfe_u32 s44, s2, 0x10010
+; GFX1250-NEXT: s_bfe_u32 s45, s2, 0x10016
+; GFX1250-NEXT: s_bfe_u32 s46, s2, 0x10015
+; GFX1250-NEXT: s_bfe_u32 s47, s2, 0x10014
+; GFX1250-NEXT: s_bfe_u32 s48, s2, 0x1001a
+; GFX1250-NEXT: s_bfe_u32 s49, s2, 0x10018
+; GFX1250-NEXT: s_bfe_u32 s50, s2, 0x1001e
+; GFX1250-NEXT: s_bfe_u32 s2, s2, 0x1001c
+; GFX1250-NEXT: v_dual_mov_b32 v9, s22 :: v_dual_mov_b32 v10, s53
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v12, s51
+; GFX1250-NEXT: v_dual_mov_b32 v13, s20 :: v_dual_mov_b32 v14, s52
+; GFX1250-NEXT: v_dual_mov_b32 v15, s19 :: v_dual_mov_b32 v16, s2
+; GFX1250-NEXT: v_dual_mov_b32 v17, s18 :: v_dual_mov_b32 v18, s50
+; GFX1250-NEXT: v_dual_mov_b32 v19, s17 :: v_dual_mov_b32 v20, s49
+; GFX1250-NEXT: v_dual_mov_b32 v21, s16 :: v_dual_mov_b32 v22, s48
+; GFX1250-NEXT: v_mov_b32_e32 v23, s15
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:176
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:160
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:144
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:128
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:112
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[0:1] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x5
+; GFX1250-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s46
+; GFX1250-NEXT: v_dual_mov_b32 v2, s45 :: v_dual_mov_b32 v3, s14
+; GFX1250-NEXT: s_wait_xcnt 0x4
+; GFX1250-NEXT: v_dual_mov_b32 v4, s44 :: v_dual_mov_b32 v5, s13
+; GFX1250-NEXT: v_dual_mov_b32 v6, s43 :: v_dual_mov_b32 v7, s12
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v8, s42 :: v_dual_mov_b32 v9, s11
+; GFX1250-NEXT: v_dual_mov_b32 v10, s41 :: v_dual_mov_b32 v11, s10
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_dual_mov_b32 v12, s40 :: v_dual_mov_b32 v13, s9
+; GFX1250-NEXT: v_dual_mov_b32 v14, s39 :: v_dual_mov_b32 v15, s8
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v16, s38 :: v_dual_mov_b32 v17, s7
+; GFX1250-NEXT: v_dual_mov_b32 v18, s37 :: v_dual_mov_b32 v19, s6
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v20, s35 :: v_dual_mov_b32 v21, s5
+; GFX1250-NEXT: v_dual_mov_b32 v22, s36 :: v_dual_mov_b32 v23, s4
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:80
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:64
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <64 x i1>, ptr addrspace(4) %in
%ext = zext <64 x i1> %load to <64 x i32>
store <64 x i32> %ext, ptr addrspace(1) %out
@@ -4148,6 +4763,141 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v64i1_to_v64i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_ashr_i32 s63, s3, 31
+; GFX1250-NEXT: s_bfe_i32 s64, s3, 0x1001e
+; GFX1250-NEXT: s_bfe_i32 s65, s3, 0x1001c
+; GFX1250-NEXT: s_bfe_i32 s66, s3, 0x1001d
+; GFX1250-NEXT: s_bfe_i32 s59, s3, 0x1001b
+; GFX1250-NEXT: s_bfe_i32 s60, s3, 0x1001a
+; GFX1250-NEXT: s_bfe_i32 s61, s3, 0x10019
+; GFX1250-NEXT: s_bfe_i32 s62, s3, 0x10018
+; GFX1250-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v0, s65
+; GFX1250-NEXT: s_bfe_i32 s55, s3, 0x10017
+; GFX1250-NEXT: s_bfe_i32 s56, s3, 0x10016
+; GFX1250-NEXT: s_bfe_i32 s57, s3, 0x10015
+; GFX1250-NEXT: s_bfe_i32 s58, s3, 0x10014
+; GFX1250-NEXT: v_dual_mov_b32 v1, s66 :: v_dual_mov_b32 v2, s64
+; GFX1250-NEXT: v_dual_mov_b32 v3, s63 :: v_dual_mov_b32 v4, s62
+; GFX1250-NEXT: s_bfe_i32 s51, s3, 0x10013
+; GFX1250-NEXT: s_bfe_i32 s52, s3, 0x10012
+; GFX1250-NEXT: s_bfe_i32 s53, s3, 0x10011
+; GFX1250-NEXT: s_bfe_i32 s54, s3, 0x10010
+; GFX1250-NEXT: v_dual_mov_b32 v5, s61 :: v_dual_mov_b32 v6, s60
+; GFX1250-NEXT: v_dual_mov_b32 v7, s59 :: v_dual_mov_b32 v8, s58
+; GFX1250-NEXT: v_dual_mov_b32 v9, s57 :: v_dual_mov_b32 v10, s56
+; GFX1250-NEXT: v_dual_mov_b32 v11, s55 :: v_dual_mov_b32 v12, s54
+; GFX1250-NEXT: s_bfe_i32 s36, s3, 0x10003
+; GFX1250-NEXT: s_bfe_i32 s37, s3, 0x10002
+; GFX1250-NEXT: s_bfe_i32 s38, s3, 0x10001
+; GFX1250-NEXT: s_bfe_i32 s39, s3, 0x10000
+; GFX1250-NEXT: s_bfe_i32 s40, s3, 0x10007
+; GFX1250-NEXT: s_bfe_i32 s41, s3, 0x10006
+; GFX1250-NEXT: s_bfe_i32 s42, s3, 0x10005
+; GFX1250-NEXT: s_bfe_i32 s43, s3, 0x10004
+; GFX1250-NEXT: s_bfe_i32 s44, s3, 0x1000b
+; GFX1250-NEXT: s_bfe_i32 s45, s3, 0x1000a
+; GFX1250-NEXT: s_bfe_i32 s46, s3, 0x10009
+; GFX1250-NEXT: s_bfe_i32 s47, s3, 0x10008
+; GFX1250-NEXT: s_bfe_i32 s48, s3, 0x1000f
+; GFX1250-NEXT: s_bfe_i32 s49, s3, 0x1000e
+; GFX1250-NEXT: s_bfe_i32 s50, s3, 0x1000d
+; GFX1250-NEXT: v_dual_mov_b32 v13, s53 :: v_dual_mov_b32 v14, s52
+; GFX1250-NEXT: s_bfe_i32 s3, s3, 0x1000c
+; GFX1250-NEXT: v_mov_b32_e32 v15, s51
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:240
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:224
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:208
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:192
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s50
+; GFX1250-NEXT: v_dual_mov_b32 v2, s49 :: v_dual_mov_b32 v3, s48
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_dual_mov_b32 v4, s47 :: v_dual_mov_b32 v5, s46
+; GFX1250-NEXT: v_dual_mov_b32 v6, s45 :: v_dual_mov_b32 v7, s44
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_mov_b32_e32 v8, s43
+; GFX1250-NEXT: s_bfe_i32 s4, s2, 0x10003
+; GFX1250-NEXT: s_bfe_i32 s5, s2, 0x10002
+; GFX1250-NEXT: s_bfe_i32 s6, s2, 0x10001
+; GFX1250-NEXT: s_bfe_i32 s7, s2, 0x10000
+; GFX1250-NEXT: s_bfe_i32 s8, s2, 0x10007
+; GFX1250-NEXT: s_bfe_i32 s9, s2, 0x10006
+; GFX1250-NEXT: s_bfe_i32 s10, s2, 0x10005
+; GFX1250-NEXT: s_bfe_i32 s11, s2, 0x10004
+; GFX1250-NEXT: s_bfe_i32 s12, s2, 0x1000b
+; GFX1250-NEXT: s_bfe_i32 s13, s2, 0x1000a
+; GFX1250-NEXT: s_bfe_i32 s14, s2, 0x10009
+; GFX1250-NEXT: s_bfe_i32 s15, s2, 0x10008
+; GFX1250-NEXT: s_bfe_i32 s16, s2, 0x1000f
+; GFX1250-NEXT: s_bfe_i32 s17, s2, 0x1000e
+; GFX1250-NEXT: s_bfe_i32 s18, s2, 0x1000d
+; GFX1250-NEXT: s_bfe_i32 s19, s2, 0x1000c
+; GFX1250-NEXT: s_bfe_i32 s20, s2, 0x10013
+; GFX1250-NEXT: s_bfe_i32 s21, s2, 0x10012
+; GFX1250-NEXT: s_bfe_i32 s22, s2, 0x10011
+; GFX1250-NEXT: s_bfe_i32 s23, s2, 0x10010
+; GFX1250-NEXT: s_bfe_i32 s24, s2, 0x10017
+; GFX1250-NEXT: s_bfe_i32 s25, s2, 0x10016
+; GFX1250-NEXT: s_bfe_i32 s26, s2, 0x10015
+; GFX1250-NEXT: s_bfe_i32 s27, s2, 0x10014
+; GFX1250-NEXT: s_bfe_i32 s28, s2, 0x1001b
+; GFX1250-NEXT: s_bfe_i32 s29, s2, 0x1001a
+; GFX1250-NEXT: s_bfe_i32 s30, s2, 0x10019
+; GFX1250-NEXT: s_bfe_i32 s31, s2, 0x10018
+; GFX1250-NEXT: s_ashr_i32 s33, s2, 31
+; GFX1250-NEXT: s_bfe_i32 s34, s2, 0x1001e
+; GFX1250-NEXT: s_bfe_i32 s35, s2, 0x1001d
+; GFX1250-NEXT: s_bfe_i32 s2, s2, 0x1001c
+; GFX1250-NEXT: v_dual_mov_b32 v9, s42 :: v_dual_mov_b32 v10, s41
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v11, s40 :: v_dual_mov_b32 v12, s39
+; GFX1250-NEXT: v_dual_mov_b32 v13, s38 :: v_dual_mov_b32 v14, s37
+; GFX1250-NEXT: v_dual_mov_b32 v15, s36 :: v_dual_mov_b32 v16, s2
+; GFX1250-NEXT: v_dual_mov_b32 v17, s35 :: v_dual_mov_b32 v18, s34
+; GFX1250-NEXT: v_dual_mov_b32 v19, s33 :: v_dual_mov_b32 v20, s31
+; GFX1250-NEXT: v_dual_mov_b32 v21, s30 :: v_dual_mov_b32 v22, s29
+; GFX1250-NEXT: v_mov_b32_e32 v23, s28
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:176
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:160
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:144
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:128
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:112
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[0:1] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x5
+; GFX1250-NEXT: v_dual_mov_b32 v0, s27 :: v_dual_mov_b32 v1, s26
+; GFX1250-NEXT: v_dual_mov_b32 v2, s25 :: v_dual_mov_b32 v3, s24
+; GFX1250-NEXT: s_wait_xcnt 0x4
+; GFX1250-NEXT: v_dual_mov_b32 v4, s23 :: v_dual_mov_b32 v5, s22
+; GFX1250-NEXT: v_dual_mov_b32 v6, s21 :: v_dual_mov_b32 v7, s20
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v8, s19 :: v_dual_mov_b32 v9, s18
+; GFX1250-NEXT: v_dual_mov_b32 v10, s17 :: v_dual_mov_b32 v11, s16
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s14
+; GFX1250-NEXT: v_dual_mov_b32 v14, s13 :: v_dual_mov_b32 v15, s12
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v16, s11 :: v_dual_mov_b32 v17, s10
+; GFX1250-NEXT: v_dual_mov_b32 v18, s9 :: v_dual_mov_b32 v19, s8
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s6
+; GFX1250-NEXT: v_dual_mov_b32 v22, s5 :: v_dual_mov_b32 v23, s4
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:80
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:64
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <64 x i1>, ptr addrspace(4) %in
%ext = sext <64 x i1> %load to <64 x i32>
store <64 x i32> %ext, ptr addrspace(1) %out
@@ -4217,6 +4967,18 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_i1_to_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_and_b32 s2, s2, 1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%a = load i1, ptr addrspace(4) %in
%ext = zext i1 %a to i64
store i64 %ext, ptr addrspace(1) %out
@@ -4287,6 +5049,19 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_i1_to_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%a = load i1, ptr addrspace(4) %in
%ext = sext i1 %a to i64
store i64 %ext, ptr addrspace(1) %out
@@ -4356,6 +5131,18 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v1i1_to_v1i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_and_b32 s2, s2, 1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <1 x i1>, ptr addrspace(4) %in
%ext = zext <1 x i1> %load to <1 x i64>
store <1 x i64> %ext, ptr addrspace(1) %out
@@ -4426,6 +5213,19 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v1i1_to_v1i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <1 x i1>, ptr addrspace(4) %in
%ext = sext <1 x i1> %load to <1 x i64>
store <1 x i64> %ext, ptr addrspace(1) %out
@@ -4508,6 +5308,23 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v2i1_to_v2i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_lshrrev_b32_e32 v2, 1, v2
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <2 x i1>, ptr addrspace(4) %in
%ext = zext <2 x i1> %load to <2 x i64>
store <2 x i64> %ext, ptr addrspace(1) %out
@@ -4592,6 +5409,21 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v2i1_to_v2i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v4, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v0, v4, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_lshrrev_b32_e32 v2, 1, v0
+; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 1
+; GFX1250-NEXT: v_dual_ashrrev_i32 v1, 31, v0 :: v_dual_ashrrev_i32 v3, 31, v2
+; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <2 x i1>, ptr addrspace(4) %in
%ext = sext <2 x i1> %load to <2 x i64>
store <2 x i64> %ext, ptr addrspace(1) %out
@@ -4695,6 +5527,27 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
; GFX12-NEXT: global_store_b64 v5, v[4:5], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v5, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v3i1_to_v3i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v5, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v0, v5, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; GFX1250-NEXT: v_bfe_u32 v2, v0, 1, 1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_dual_lshrrev_b32 v4, 2, v1 :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40
+; GFX1250-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v3, v5
+; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b64 v5, v[4:5], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v5, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <3 x i1>, ptr addrspace(4) %in
%ext = zext <3 x i1> %load to <3 x i64>
store <3 x i64> %ext, ptr addrspace(1) %out
@@ -4805,6 +5658,28 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
; GFX12-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v6, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v3i1_to_v3i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v5, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v0, v5, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_dual_lshrrev_b32 v2, 2, v0 :: v_dual_lshrrev_b32 v4, 1, v0
+; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_bfe_i32 v6, v2, 0, 1
+; GFX1250-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_bfe_i32 v2, v4, 0, 1
+; GFX1250-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b64 v5, v[6:7], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v5, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <3 x i1>, ptr addrspace(4) %in
%ext = sext <3 x i1> %load to <3 x i64>
store <3 x i64> %ext, ptr addrspace(1) %out
@@ -4921,6 +5796,32 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v4i1_to_v4i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10002
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_lshrrev_b32_e32 v2, 3, v0
+; GFX1250-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX1250-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10001
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX1250-NEXT: s_and_b32 s2, s2, 1
+; GFX1250-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX1250-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <4 x i1>, ptr addrspace(4) %in
%ext = zext <4 x i1> %load to <4 x i64>
store <4 x i64> %ext, ptr addrspace(1) %out
@@ -5044,6 +5945,29 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v4i1_to_v4i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v9, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v0, v9, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_dual_lshrrev_b32 v2, 3, v0 :: v_dual_lshrrev_b32 v4, 2, v0
+; GFX1250-NEXT: v_lshrrev_b32_e32 v8, 1, v0
+; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_bfe_i32 v6, v2, 0, 1
+; GFX1250-NEXT: v_bfe_i32 v4, v4, 0, 1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_bfe_i32 v2, v8, 0, 1
+; GFX1250-NEXT: v_dual_ashrrev_i32 v1, 31, v0 :: v_dual_ashrrev_i32 v7, 31, v6
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_dual_ashrrev_i32 v5, 31, v4 :: v_dual_ashrrev_i32 v3, 31, v2
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v9, v[4:7], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v9, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <4 x i1>, ptr addrspace(4) %in
%ext = sext <4 x i1> %load to <4 x i64>
store <4 x i64> %ext, ptr addrspace(1) %out
@@ -5208,6 +6132,33 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v8i1_to_v8i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v12, v1, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v12
+; GFX1250-NEXT: v_bfe_u32 v6, v12, 5, 1
+; GFX1250-NEXT: v_bfe_u32 v4, v12, 4, 1
+; GFX1250-NEXT: v_bfe_u32 v10, v12, 3, 1
+; GFX1250-NEXT: v_bfe_u32 v8, v12, 2, 1
+; GFX1250-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_lshrrev_b32 v2, 7, v0
+; GFX1250-NEXT: v_mov_b32_e32 v5, v1
+; GFX1250-NEXT: v_bfe_u32 v0, v0, 6, 1
+; GFX1250-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v9, v1
+; GFX1250-NEXT: v_dual_mov_b32 v11, v1 :: v_dual_mov_b32 v13, v1
+; GFX1250-NEXT: v_mov_b32_e32 v15, v1
+; GFX1250-NEXT: v_bfe_u32 v14, v12, 1, 1
+; GFX1250-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v1, v[12:15], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <8 x i1>, ptr addrspace(4) %in
%ext = zext <8 x i1> %load to <8 x i64>
store <8 x i64> %ext, ptr addrspace(1) %out
@@ -5413,6 +6364,46 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v8i1_to_v8i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v16, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v0, v16, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_e32 v10, s3
+; GFX1250-NEXT: s_lshr_b32 s2, s3, 6
+; GFX1250-NEXT: s_lshr_b32 s4, s3, 7
+; GFX1250-NEXT: s_lshr_b32 s6, s3, 4
+; GFX1250-NEXT: s_lshr_b32 s8, s3, 5
+; GFX1250-NEXT: s_lshr_b32 s10, s3, 2
+; GFX1250-NEXT: s_lshr_b32 s12, s3, 3
+; GFX1250-NEXT: s_lshr_b32 s14, s3, 1
+; GFX1250-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX1250-NEXT: v_bfe_i32 v12, v10, 0, 1
+; GFX1250-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1250-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s7
+; GFX1250-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v7, s9
+; GFX1250-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v9, s11
+; GFX1250-NEXT: v_dual_mov_b32 v10, s12 :: v_dual_mov_b32 v11, s13
+; GFX1250-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_ashrrev_i32 v13, 31, v12
+; GFX1250-NEXT: v_mov_b32_e32 v15, s15
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <8 x i1>, ptr addrspace(4) %in
%ext = sext <8 x i1> %load to <8 x i64>
store <8 x i64> %ext, ptr addrspace(1) %out
@@ -5701,6 +6692,49 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:64
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v16i1_to_v16i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u16 v12, v1, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v22, 0xffff, v12
+; GFX1250-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_bitop2_b32 v28, 1, v12 bitop3:0x40
+; GFX1250-NEXT: v_mov_b32_e32 v5, v1
+; GFX1250-NEXT: v_bfe_u32 v0, v12, 10, 1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1250-NEXT: v_bfe_u32 v2, v22, 11, 1
+; GFX1250-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v9, v1
+; GFX1250-NEXT: v_bfe_u32 v6, v12, 9, 1
+; GFX1250-NEXT: v_bfe_u32 v4, v22, 8, 1
+; GFX1250-NEXT: v_dual_mov_b32 v11, v1 :: v_dual_mov_b32 v13, v1
+; GFX1250-NEXT: v_dual_mov_b32 v31, v1 :: v_dual_lshrrev_b32 v10, 15, v22
+; GFX1250-NEXT: v_bfe_u32 v8, v22, 14, 1
+; GFX1250-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_mov_b32 v17, v1
+; GFX1250-NEXT: v_bfe_u32 v14, v12, 13, 1
+; GFX1250-NEXT: v_bfe_u32 v18, v12, 7, 1
+; GFX1250-NEXT: v_bfe_u32 v26, v12, 3, 1
+; GFX1250-NEXT: v_bfe_u32 v30, v12, 1, 1
+; GFX1250-NEXT: v_bfe_u32 v24, v12, 2, 1
+; GFX1250-NEXT: v_bfe_u32 v20, v12, 4, 1
+; GFX1250-NEXT: v_bfe_u32 v16, v12, 6, 1
+; GFX1250-NEXT: v_bfe_u32 v12, v12, 12, 1
+; GFX1250-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v21, v1
+; GFX1250-NEXT: v_dual_mov_b32 v23, v1 :: v_dual_mov_b32 v25, v1
+; GFX1250-NEXT: v_dual_mov_b32 v27, v1 :: v_dual_mov_b32 v29, v1
+; GFX1250-NEXT: v_bfe_u32 v22, v22, 5, 1
+; GFX1250-NEXT: s_clause 0x7
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80
+; GFX1250-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:64
+; GFX1250-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:112
+; GFX1250-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:96
+; GFX1250-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v1, v[20:23], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v1, v[24:27], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v1, v[28:31], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <16 x i1>, ptr addrspace(4) %in
%ext = zext <16 x i1> %load to <16 x i64>
store <16 x i64> %ext, ptr addrspace(1) %out
@@ -6063,6 +7097,75 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v16i1_to_v16i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v32, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u16 v0, v32, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_e32 v28, s3
+; GFX1250-NEXT: s_lshr_b32 s2, s3, 14
+; GFX1250-NEXT: s_lshr_b32 s4, s3, 15
+; GFX1250-NEXT: s_lshr_b32 s10, s3, 10
+; GFX1250-NEXT: s_lshr_b32 s12, s3, 11
+; GFX1250-NEXT: s_lshr_b32 s6, s3, 12
+; GFX1250-NEXT: s_lshr_b32 s8, s3, 13
+; GFX1250-NEXT: s_lshr_b32 s14, s3, 8
+; GFX1250-NEXT: s_lshr_b32 s16, s3, 9
+; GFX1250-NEXT: s_lshr_b32 s18, s3, 6
+; GFX1250-NEXT: s_lshr_b32 s20, s3, 7
+; GFX1250-NEXT: s_lshr_b32 s22, s3, 4
+; GFX1250-NEXT: s_lshr_b32 s24, s3, 5
+; GFX1250-NEXT: s_lshr_b32 s26, s3, 2
+; GFX1250-NEXT: s_lshr_b32 s28, s3, 3
+; GFX1250-NEXT: s_lshr_b32 s30, s3, 1
+; GFX1250-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1250-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v9, s11
+; GFX1250-NEXT: v_dual_mov_b32 v10, s12 :: v_dual_mov_b32 v11, s13
+; GFX1250-NEXT: v_bfe_i32 v28, v28, 0, 1
+; GFX1250-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s7
+; GFX1250-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v7, s9
+; GFX1250-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v13, s15
+; GFX1250-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v15, s17
+; GFX1250-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v17, s19
+; GFX1250-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v19, s21
+; GFX1250-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v21, s23
+; GFX1250-NEXT: v_dual_mov_b32 v22, s24 :: v_dual_mov_b32 v23, s25
+; GFX1250-NEXT: v_dual_mov_b32 v24, s26 :: v_dual_mov_b32 v25, s27
+; GFX1250-NEXT: v_dual_mov_b32 v26, s28 :: v_dual_mov_b32 v27, s29
+; GFX1250-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:112
+; GFX1250-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:96
+; GFX1250-NEXT: v_ashrrev_i32_e32 v29, 31, v28
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:80
+; GFX1250-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:64
+; GFX1250-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v32, v[28:31], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <16 x i1>, ptr addrspace(4) %in
%ext = sext <16 x i1> %load to <16 x i64>
store <16 x i64> %ext, ptr addrspace(1) %out
@@ -6592,6 +7695,95 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v32i1_to_v32i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1001e
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
+; GFX1250-NEXT: s_lshr_b32 s4, s2, 31
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1001d
+; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, v1
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001c
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:240
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1001b
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001a
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:224
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10019
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10018
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:208
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10017
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10016
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10014
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10015
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10013
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10012
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10011
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10010
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1000f
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000e
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1000d
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000c
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1000b
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000a
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10009
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10008
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10007
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10006
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10005
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10004
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10003
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10002
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10001
+; GFX1250-NEXT: s_and_b32 s2, s2, 1
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <32 x i1>, ptr addrspace(4) %in
%ext = zext <32 x i1> %load to <32 x i64>
store <32 x i64> %ext, ptr addrspace(1) %out
@@ -7300,6 +8492,141 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v32i1_to_v32i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_lshr_b32 s34, s2, 30
+; GFX1250-NEXT: s_lshr_b32 s36, s2, 31
+; GFX1250-NEXT: s_lshr_b32 s38, s2, 28
+; GFX1250-NEXT: s_lshr_b32 s40, s2, 29
+; GFX1250-NEXT: s_lshr_b32 s42, s2, 26
+; GFX1250-NEXT: s_lshr_b32 s44, s2, 27
+; GFX1250-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX1250-NEXT: s_lshr_b32 s46, s2, 24
+; GFX1250-NEXT: s_lshr_b32 s48, s2, 25
+; GFX1250-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v0, s34
+; GFX1250-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v1, s35 :: v_dual_mov_b32 v2, s36
+; GFX1250-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, s38
+; GFX1250-NEXT: s_lshr_b32 s26, s2, 22
+; GFX1250-NEXT: s_lshr_b32 s50, s2, 23
+; GFX1250-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v5, s39 :: v_dual_mov_b32 v6, s40
+; GFX1250-NEXT: v_dual_mov_b32 v7, s41 :: v_dual_mov_b32 v8, s42
+; GFX1250-NEXT: s_lshr_b32 s52, s2, 20
+; GFX1250-NEXT: s_lshr_b32 s54, s2, 21
+; GFX1250-NEXT: v_dual_mov_b32 v9, s43 :: v_dual_mov_b32 v10, s44
+; GFX1250-NEXT: v_dual_mov_b32 v11, s45 :: v_dual_mov_b32 v12, s46
+; GFX1250-NEXT: s_lshr_b32 s56, s2, 18
+; GFX1250-NEXT: s_lshr_b32 s58, s2, 19
+; GFX1250-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v13, s47 :: v_dual_mov_b32 v14, s48
+; GFX1250-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX1250-NEXT: v_mov_b32_e32 v15, s49
+; GFX1250-NEXT: s_lshr_b32 s60, s2, 16
+; GFX1250-NEXT: s_lshr_b32 s62, s2, 17
+; GFX1250-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000
+; GFX1250-NEXT: s_lshr_b32 s64, s2, 14
+; GFX1250-NEXT: s_lshr_b32 s66, s2, 15
+; GFX1250-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:240
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:224
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:208
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:192
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v1, s27
+; GFX1250-NEXT: v_dual_mov_b32 v2, s50 :: v_dual_mov_b32 v3, s51
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s52
+; GFX1250-NEXT: s_lshr_b32 s30, s2, 12
+; GFX1250-NEXT: s_lshr_b32 s28, s2, 13
+; GFX1250-NEXT: s_lshr_b32 s24, s2, 10
+; GFX1250-NEXT: s_lshr_b32 s22, s2, 11
+; GFX1250-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v5, s53 :: v_dual_mov_b32 v6, s54
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v7, s55 :: v_dual_mov_b32 v8, s56
+; GFX1250-NEXT: s_lshr_b32 s20, s2, 8
+; GFX1250-NEXT: s_lshr_b32 s18, s2, 9
+; GFX1250-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v9, s57 :: v_dual_mov_b32 v10, s58
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v11, s59 :: v_dual_mov_b32 v12, s60
+; GFX1250-NEXT: s_lshr_b32 s16, s2, 6
+; GFX1250-NEXT: s_lshr_b32 s14, s2, 7
+; GFX1250-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v13, s61 :: v_dual_mov_b32 v14, s62
+; GFX1250-NEXT: v_dual_mov_b32 v15, s63 :: v_dual_mov_b32 v16, s64
+; GFX1250-NEXT: s_lshr_b32 s12, s2, 4
+; GFX1250-NEXT: s_lshr_b32 s10, s2, 5
+; GFX1250-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v17, s65 :: v_dual_mov_b32 v18, s66
+; GFX1250-NEXT: v_dual_mov_b32 v19, s67 :: v_dual_mov_b32 v20, s30
+; GFX1250-NEXT: s_lshr_b32 s8, s2, 2
+; GFX1250-NEXT: s_lshr_b32 s6, s2, 3
+; GFX1250-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v21, s31 :: v_dual_mov_b32 v22, s28
+; GFX1250-NEXT: v_mov_b32_e32 v23, s29
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:176
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:160
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:144
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:128
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:112
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[0:1] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x5
+; GFX1250-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v1, s25
+; GFX1250-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s23
+; GFX1250-NEXT: s_wait_xcnt 0x4
+; GFX1250-NEXT: v_mov_b32_e32 v4, s20
+; GFX1250-NEXT: s_lshr_b32 s68, s2, 1
+; GFX1250-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v5, s21 :: v_dual_mov_b32 v6, s18
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s16
+; GFX1250-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v9, s17 :: v_dual_mov_b32 v10, s14
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_dual_mov_b32 v11, s15 :: v_dual_mov_b32 v12, s12
+; GFX1250-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v13, s13 :: v_dual_mov_b32 v14, s10
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v15, s11 :: v_dual_mov_b32 v16, s8
+; GFX1250-NEXT: v_dual_mov_b32 v17, s9 :: v_dual_mov_b32 v18, s6
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v19, s7 :: v_dual_mov_b32 v20, s4
+; GFX1250-NEXT: v_dual_mov_b32 v21, s5 :: v_dual_mov_b32 v22, s2
+; GFX1250-NEXT: v_mov_b32_e32 v23, s3
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:80
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:64
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <32 x i1>, ptr addrspace(4) %in
%ext = sext <32 x i1> %load to <32 x i64>
store <32 x i64> %ext, ptr addrspace(1) %out
@@ -8327,6 +9654,179 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v64i1_to_v64i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10014
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10015
+; GFX1250-NEXT: s_lshr_b32 s4, s3, 31
+; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1001e
+; GFX1250-NEXT: s_bfe_u32 s6, s2, 0x10004
+; GFX1250-NEXT: s_and_b32 s7, s2, 1
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1001d
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1001c
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:496
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1001b
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1001a
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:480
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10019
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10018
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:464
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10017
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10016
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:448
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10013
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10012
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10011
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10010
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:400
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1000f
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1000e
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:384
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1000d
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1000c
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:368
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1000b
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1000a
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:352
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10009
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10008
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:336
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10007
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10006
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:320
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10005
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10004
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:304
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10003
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10002
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:288
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_lshr_b32 s4, s2, 31
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1001e
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:272
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001d
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1001c
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:240
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001b
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1001a
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:224
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10019
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10018
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:208
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10017
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10016
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10014
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10015
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s5
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10013
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10012
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10011
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10010
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000f
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1000e
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000d
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1000c
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000b
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1000a
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10009
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10008
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10007
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10006
+; GFX1250-NEXT: v_mov_b32_e32 v7, v1
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_mov_b32 s4, s3
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10001
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10005
+; GFX1250-NEXT: v_mov_b32_e32 v6, s5
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10003
+; GFX1250-NEXT: s_bfe_u32 s6, s2, 0x10001
+; GFX1250-NEXT: s_bfe_u32 s2, s2, 0x10002
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_and_b64 s[2:3], s[4:5], 1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v2, s6
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:256
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <64 x i1>, ptr addrspace(4) %in
%ext = zext <64 x i1> %load to <64 x i64>
store <64 x i64> %ext, ptr addrspace(1) %out
@@ -9703,6 +11203,271 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v64i1_to_v64i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[10:11], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_lshr_b32 s96, s11, 30
+; GFX1250-NEXT: s_lshr_b32 s98, s11, 31
+; GFX1250-NEXT: s_lshr_b32 s92, s11, 28
+; GFX1250-NEXT: s_lshr_b32 s94, s11, 29
+; GFX1250-NEXT: s_lshr_b32 s78, s11, 26
+; GFX1250-NEXT: s_lshr_b32 s88, s11, 27
+; GFX1250-NEXT: s_bfe_i64 s[96:97], s[96:97], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[100:101], s[98:99], 0x10000
+; GFX1250-NEXT: s_lshr_b32 s66, s11, 24
+; GFX1250-NEXT: s_lshr_b32 s74, s11, 25
+; GFX1250-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[94:95], s[94:95], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v0, s96
+; GFX1250-NEXT: s_lshr_b32 s56, s11, 22
+; GFX1250-NEXT: s_lshr_b32 s62, s11, 23
+; GFX1250-NEXT: v_dual_mov_b32 v1, s97 :: v_dual_mov_b32 v2, s100
+; GFX1250-NEXT: v_dual_mov_b32 v3, s101 :: v_dual_mov_b32 v4, s92
+; GFX1250-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[88:89], s[88:89], 0x10000
+; GFX1250-NEXT: s_lshr_b32 s44, s11, 20
+; GFX1250-NEXT: s_lshr_b32 s52, s11, 21
+; GFX1250-NEXT: s_lshr_b32 s30, s11, 18
+; GFX1250-NEXT: s_lshr_b32 s40, s11, 19
+; GFX1250-NEXT: s_lshr_b32 s18, s11, 16
+; GFX1250-NEXT: s_lshr_b32 s26, s11, 17
+; GFX1250-NEXT: s_lshr_b32 s2, s11, 14
+; GFX1250-NEXT: s_lshr_b32 s4, s11, 15
+; GFX1250-NEXT: v_dual_mov_b32 v5, s93 :: v_dual_mov_b32 v6, s94
+; GFX1250-NEXT: v_dual_mov_b32 v7, s95 :: v_dual_mov_b32 v10, s78
+; GFX1250-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000
+; GFX1250-NEXT: s_lshr_b32 s6, s11, 12
+; GFX1250-NEXT: s_lshr_b32 s8, s11, 13
+; GFX1250-NEXT: v_dual_mov_b32 v11, s79 :: v_dual_mov_b32 v12, s88
+; GFX1250-NEXT: v_dual_mov_b32 v13, s89 :: v_dual_mov_b32 v14, s66
+; GFX1250-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000
+; GFX1250-NEXT: s_lshr_b32 s12, s11, 10
+; GFX1250-NEXT: s_lshr_b32 s14, s11, 11
+; GFX1250-NEXT: v_dual_mov_b32 v15, s67 :: v_dual_mov_b32 v16, s74
+; GFX1250-NEXT: v_dual_mov_b32 v17, s75 :: v_dual_mov_b32 v18, s56
+; GFX1250-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX1250-NEXT: s_lshr_b32 s16, s11, 8
+; GFX1250-NEXT: s_lshr_b32 s20, s11, 9
+; GFX1250-NEXT: v_dual_mov_b32 v19, s57 :: v_dual_mov_b32 v20, s62
+; GFX1250-NEXT: v_dual_mov_b32 v21, s63 :: v_dual_mov_b32 v22, s44
+; GFX1250-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX1250-NEXT: s_lshr_b32 s22, s11, 6
+; GFX1250-NEXT: s_lshr_b32 s24, s11, 7
+; GFX1250-NEXT: v_dual_mov_b32 v23, s45 :: v_dual_mov_b32 v24, s52
+; GFX1250-NEXT: v_dual_mov_b32 v25, s53 :: v_dual_mov_b32 v26, s30
+; GFX1250-NEXT: v_dual_mov_b32 v27, s31 :: v_dual_mov_b32 v28, s40
+; GFX1250-NEXT: v_dual_mov_b32 v29, s41 :: v_dual_mov_b32 v30, s18
+; GFX1250-NEXT: v_dual_mov_b32 v31, s19 :: v_dual_mov_b32 v32, s26
+; GFX1250-NEXT: v_mov_b32_e32 v33, s27
+; GFX1250-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX1250-NEXT: s_clause 0x7
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:496
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:480
+; GFX1250-NEXT: global_store_b128 v8, v[10:13], s[0:1] offset:464
+; GFX1250-NEXT: global_store_b128 v8, v[14:17], s[0:1] offset:448
+; GFX1250-NEXT: global_store_b128 v8, v[18:21], s[0:1] offset:432
+; GFX1250-NEXT: global_store_b128 v8, v[22:25], s[0:1] offset:416
+; GFX1250-NEXT: global_store_b128 v8, v[26:29], s[0:1] offset:400
+; GFX1250-NEXT: global_store_b128 v8, v[30:33], s[0:1] offset:384
+; GFX1250-NEXT: s_wait_xcnt 0x7
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1250-NEXT: s_wait_xcnt 0x6
+; GFX1250-NEXT: v_mov_b32_e32 v4, s6
+; GFX1250-NEXT: s_lshr_b32 s28, s11, 4
+; GFX1250-NEXT: s_lshr_b32 s34, s11, 5
+; GFX1250-NEXT: s_lshr_b32 s36, s11, 2
+; GFX1250-NEXT: s_lshr_b32 s38, s11, 3
+; GFX1250-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v5, s7 :: v_dual_mov_b32 v6, s8
+; GFX1250-NEXT: s_wait_xcnt 0x5
+; GFX1250-NEXT: v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v10, s12
+; GFX1250-NEXT: s_lshr_b32 s42, s11, 1
+; GFX1250-NEXT: s_mov_b32 s46, s11
+; GFX1250-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v11, s13 :: v_dual_mov_b32 v12, s14
+; GFX1250-NEXT: s_wait_xcnt 0x4
+; GFX1250-NEXT: v_dual_mov_b32 v13, s15 :: v_dual_mov_b32 v14, s16
+; GFX1250-NEXT: s_lshr_b32 s48, s10, 30
+; GFX1250-NEXT: s_lshr_b32 s50, s10, 31
+; GFX1250-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v15, s17 :: v_dual_mov_b32 v16, s20
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v17, s21 :: v_dual_mov_b32 v18, s22
+; GFX1250-NEXT: s_lshr_b32 s54, s10, 28
+; GFX1250-NEXT: s_lshr_b32 s58, s10, 29
+; GFX1250-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v19, s23 :: v_dual_mov_b32 v20, s24
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_dual_mov_b32 v21, s25 :: v_dual_mov_b32 v22, s28
+; GFX1250-NEXT: s_lshr_b32 s60, s10, 26
+; GFX1250-NEXT: s_lshr_b32 s64, s10, 27
+; GFX1250-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v23, s29 :: v_dual_mov_b32 v24, s34
+; GFX1250-NEXT: v_mov_b32_e32 v25, s35
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:368
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:352
+; GFX1250-NEXT: global_store_b128 v8, v[10:13], s[0:1] offset:336
+; GFX1250-NEXT: global_store_b128 v8, v[14:17], s[0:1] offset:320
+; GFX1250-NEXT: global_store_b128 v8, v[18:21], s[0:1] offset:304
+; GFX1250-NEXT: global_store_b128 v8, v[22:25], s[0:1] offset:288
+; GFX1250-NEXT: s_wait_xcnt 0x5
+; GFX1250-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37
+; GFX1250-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39
+; GFX1250-NEXT: s_wait_xcnt 0x4
+; GFX1250-NEXT: v_mov_b32_e32 v4, s46
+; GFX1250-NEXT: s_lshr_b32 s68, s10, 24
+; GFX1250-NEXT: s_lshr_b32 s70, s10, 25
+; GFX1250-NEXT: s_lshr_b32 s72, s10, 22
+; GFX1250-NEXT: s_lshr_b32 s76, s10, 23
+; GFX1250-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v5, s47 :: v_dual_mov_b32 v6, s42
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v7, s43 :: v_dual_mov_b32 v10, s48
+; GFX1250-NEXT: s_lshr_b32 s80, s10, 20
+; GFX1250-NEXT: s_lshr_b32 s82, s10, 21
+; GFX1250-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v11, s49 :: v_dual_mov_b32 v12, s50
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_dual_mov_b32 v13, s51 :: v_dual_mov_b32 v14, s54
+; GFX1250-NEXT: s_lshr_b32 s84, s10, 18
+; GFX1250-NEXT: s_lshr_b32 s86, s10, 19
+; GFX1250-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v15, s55 :: v_dual_mov_b32 v16, s58
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v17, s59 :: v_dual_mov_b32 v18, s60
+; GFX1250-NEXT: s_lshr_b32 s90, s10, 16
+; GFX1250-NEXT: s_lshr_b32 s98, s10, 17
+; GFX1250-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v19, s61 :: v_dual_mov_b32 v20, s64
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v21, s65 :: v_dual_mov_b32 v22, s68
+; GFX1250-NEXT: s_lshr_b32 s96, s10, 14
+; GFX1250-NEXT: s_lshr_b32 s100, s10, 15
+; GFX1250-NEXT: s_lshr_b32 s94, s10, 13
+; GFX1250-NEXT: s_lshr_b32 s88, s10, 11
+; GFX1250-NEXT: s_lshr_b32 s74, s10, 9
+; GFX1250-NEXT: s_lshr_b32 s62, s10, 7
+; GFX1250-NEXT: s_lshr_b32 s52, s10, 5
+; GFX1250-NEXT: s_lshr_b32 s40, s10, 3
+; GFX1250-NEXT: s_lshr_b32 s26, s10, 1
+; GFX1250-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v23, s69 :: v_dual_mov_b32 v24, s70
+; GFX1250-NEXT: v_mov_b32_e32 v25, s71
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:272
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:256
+; GFX1250-NEXT: global_store_b128 v8, v[10:13], s[0:1] offset:240
+; GFX1250-NEXT: global_store_b128 v8, v[14:17], s[0:1] offset:224
+; GFX1250-NEXT: global_store_b128 v8, v[18:21], s[0:1] offset:208
+; GFX1250-NEXT: global_store_b128 v8, v[22:25], s[0:1] offset:192
+; GFX1250-NEXT: s_wait_xcnt 0x5
+; GFX1250-NEXT: v_dual_mov_b32 v0, s72 :: v_dual_mov_b32 v1, s73
+; GFX1250-NEXT: v_dual_mov_b32 v2, s76 :: v_dual_mov_b32 v3, s77
+; GFX1250-NEXT: s_wait_xcnt 0x4
+; GFX1250-NEXT: v_mov_b32_e32 v4, s80
+; GFX1250-NEXT: s_lshr_b32 s92, s10, 12
+; GFX1250-NEXT: s_lshr_b32 s78, s10, 10
+; GFX1250-NEXT: s_bfe_i64 s[98:99], s[98:99], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[90:91], s[90:91], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v5, s81 :: v_dual_mov_b32 v6, s82
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v7, s83 :: v_dual_mov_b32 v10, s84
+; GFX1250-NEXT: s_lshr_b32 s66, s10, 8
+; GFX1250-NEXT: s_lshr_b32 s56, s10, 6
+; GFX1250-NEXT: s_lshr_b32 s44, s10, 4
+; GFX1250-NEXT: s_lshr_b32 s30, s10, 2
+; GFX1250-NEXT: s_bfe_i64 s[18:19], s[10:11], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[10:11], s[26:27], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[26:27], s[40:41], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[52:53], s[62:63], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[62:63], s[74:75], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[74:75], s[88:89], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[88:89], s[94:95], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[94:95], s[100:101], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[96:97], s[96:97], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v11, s85 :: v_dual_mov_b32 v12, s86
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_dual_mov_b32 v13, s87 :: v_dual_mov_b32 v14, s90
+; GFX1250-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v15, s91 :: v_dual_mov_b32 v16, s98
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v17, s99 :: v_dual_mov_b32 v18, s96
+; GFX1250-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v19, s97 :: v_dual_mov_b32 v20, s94
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v21, s95 :: v_dual_mov_b32 v22, s92
+; GFX1250-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v23, s93 :: v_dual_mov_b32 v24, s88
+; GFX1250-NEXT: v_mov_b32_e32 v25, s89
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:176
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:160
+; GFX1250-NEXT: global_store_b128 v8, v[10:13], s[0:1] offset:144
+; GFX1250-NEXT: global_store_b128 v8, v[14:17], s[0:1] offset:128
+; GFX1250-NEXT: global_store_b128 v8, v[18:21], s[0:1] offset:112
+; GFX1250-NEXT: global_store_b128 v8, v[22:25], s[0:1] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x5
+; GFX1250-NEXT: v_dual_mov_b32 v0, s78 :: v_dual_mov_b32 v1, s79
+; GFX1250-NEXT: v_dual_mov_b32 v2, s74 :: v_dual_mov_b32 v3, s75
+; GFX1250-NEXT: s_wait_xcnt 0x4
+; GFX1250-NEXT: v_mov_b32_e32 v4, s66
+; GFX1250-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v5, s67 :: v_dual_mov_b32 v6, s62
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v7, s63 :: v_dual_mov_b32 v10, s56
+; GFX1250-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v11, s57 :: v_dual_mov_b32 v12, s52
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_dual_mov_b32 v13, s53 :: v_dual_mov_b32 v14, s44
+; GFX1250-NEXT: v_dual_mov_b32 v15, s45 :: v_dual_mov_b32 v16, s40
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v17, s41 :: v_dual_mov_b32 v18, s30
+; GFX1250-NEXT: v_dual_mov_b32 v19, s31 :: v_dual_mov_b32 v20, s26
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v21, s27 :: v_dual_mov_b32 v22, s18
+; GFX1250-NEXT: v_dual_mov_b32 v23, s19 :: v_dual_mov_b32 v24, s10
+; GFX1250-NEXT: v_mov_b32_e32 v25, s11
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:80
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:64
+; GFX1250-NEXT: global_store_b128 v8, v[10:13], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v8, v[14:17], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v8, v[18:21], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v8, v[22:25], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <64 x i1>, ptr addrspace(4) %in
%ext = sext <64 x i1> %load to <64 x i64>
store <64 x i64> %ext, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index 0a938b0..6f7ee70 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -6,6 +6,7 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-HSA %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX9-HSA %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
; GFX6-NOHSA-LABEL: constant_load_i32:
@@ -83,6 +84,16 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%ld = load i32, ptr addrspace(4) %in
store i32 %ld, ptr addrspace(1) %out
@@ -170,6 +181,17 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v2i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%ld = load <2 x i32>, ptr addrspace(4) %in
store <2 x i32> %ld, ptr addrspace(1) %out
@@ -268,6 +290,17 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v3i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b96 s[4:6], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
+; GFX1250-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%ld = load <3 x i32>, ptr addrspace(4) %in
store <3 x i32> %ld, ptr addrspace(1) %out
@@ -364,6 +397,18 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v4i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v4, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%ld = load <4 x i32>, ptr addrspace(4) %in
store <4 x i32> %ld, ptr addrspace(1) %out
@@ -497,6 +542,22 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16
; GFX12-NEXT: global_store_b128 v8, v[4:7], s[8:9]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v8i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v0, s4
+; GFX1250-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX1250-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_mov_b32 v4, s0
+; GFX1250-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v6, s2
+; GFX1250-NEXT: v_mov_b32_e32 v7, s3
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[8:9]
+; GFX1250-NEXT: s_endpgm
entry:
%ld = load <8 x i32>, ptr addrspace(4) %in
store <8 x i32> %ld, ptr addrspace(1) %out
@@ -660,6 +721,25 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16
; GFX12-NEXT: global_store_b128 v8, v[4:7], s[8:9]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v9i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b32 s12, s[10:11], 0x20
+; GFX1250-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v9, s12
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX1250-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX1250-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: global_store_b32 v8, v9, s[8:9] offset:32
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[8:9]
+; GFX1250-NEXT: s_endpgm
entry:
%ld = load <9 x i32>, ptr addrspace(4) %in
store <9 x i32> %ld, ptr addrspace(1) %out
@@ -829,6 +909,26 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: global_store_b128 v10, v[0:3], s[8:9] offset:16
; GFX12-NEXT: global_store_b128 v10, v[4:7], s[8:9]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v10i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b64 s[12:13], s[10:11], 0x20
+; GFX1250-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v10, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX1250-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX1250-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: global_store_b64 v10, v[8:9], s[8:9] offset:32
+; GFX1250-NEXT: global_store_b128 v10, v[0:3], s[8:9] offset:16
+; GFX1250-NEXT: global_store_b128 v10, v[4:7], s[8:9]
+; GFX1250-NEXT: s_endpgm
entry:
%ld = load <10 x i32>, ptr addrspace(4) %in
store <10 x i32> %ld, ptr addrspace(1) %out
@@ -872,66 +972,66 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8
; GFX7-HSA-NEXT: s_add_u32 s10, s8, 16
; GFX7-HSA-NEXT: s_addc_u32 s11, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s10
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s11
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s10
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s11
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s8
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3
-; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s9
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32
-; GFX7-HSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s2
+; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9
; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12
-; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13
-; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s14
-; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-HSA-NEXT: flat_store_dwordx3 v[0:1], v[4:6]
+; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13
+; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX7-HSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; GFX7-HSA-NEXT: s_endpgm
;
; GFX8-NOHSA-LABEL: constant_load_v11i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x20
; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x20
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s0
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[0:1], v[4:6]
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s7
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; GFX8-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: constant_load_v11i32:
@@ -969,25 +1069,25 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
; GFX9-HSA-LABEL: constant_load_v11i32:
; GFX9-HSA: ; %bb.0: ; %entry
; GFX9-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
-; GFX9-HSA-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-HSA-NEXT: v_mov_b32_e32 v8, 0
; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x20
; GFX9-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GFX9-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x20
; GFX9-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s12
; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s4
; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s5
; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s6
; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-HSA-NEXT: global_store_dwordx4 v7, v[0:3], s[8:9] offset:16
-; GFX9-HSA-NEXT: v_mov_b32_e32 v5, s13
-; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-HSA-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-HSA-NEXT: v_mov_b32_e32 v6, s14
-; GFX9-HSA-NEXT: global_store_dwordx4 v7, v[0:3], s[8:9]
-; GFX9-HSA-NEXT: global_store_dwordx3 v7, v[4:6], s[8:9] offset:32
+; GFX9-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-HSA-NEXT: v_mov_b32_e32 v6, s2
+; GFX9-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] offset:16
+; GFX9-HSA-NEXT: v_mov_b32_e32 v7, s3
+; GFX9-HSA-NEXT: v_mov_b32_e32 v0, s12
+; GFX9-HSA-NEXT: v_mov_b32_e32 v1, s13
+; GFX9-HSA-NEXT: v_mov_b32_e32 v2, s14
+; GFX9-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9]
+; GFX9-HSA-NEXT: global_store_dwordx3 v8, v[0:2], s[8:9] offset:32
; GFX9-HSA-NEXT: s_endpgm
;
; GFX12-LABEL: constant_load_v11i32:
@@ -995,20 +1095,40 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b96 s[12:14], s[10:11], 0x20
; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX12-NEXT: s_load_b96 s[12:14], s[10:11], 0x20
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v8, s12
+; GFX12-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v0, s4
+; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX12-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_mov_b32 v4, s0
+; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v6, s2
+; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v8, s12
; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v10, s14
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
-; GFX12-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
; GFX12-NEXT: s_clause 0x2
-; GFX12-NEXT: global_store_b96 v11, v[8:10], s[8:9] offset:32
; GFX12-NEXT: global_store_b128 v11, v[0:3], s[8:9] offset:16
; GFX12-NEXT: global_store_b128 v11, v[4:7], s[8:9]
+; GFX12-NEXT: global_store_b96 v11, v[8:10], s[8:9] offset:32
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v11i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX1250-NEXT: s_load_b96 s[12:14], s[10:11], 0x20
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v0, s4
+; GFX1250-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX1250-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_mov_b32 v4, s0
+; GFX1250-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v6, s2
+; GFX1250-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v8, s12
+; GFX1250-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v10, s14
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: global_store_b128 v11, v[0:3], s[8:9] offset:16
+; GFX1250-NEXT: global_store_b128 v11, v[4:7], s[8:9]
+; GFX1250-NEXT: global_store_b96 v11, v[8:10], s[8:9] offset:32
+; GFX1250-NEXT: s_endpgm
entry:
%ld = load <11 x i32>, ptr addrspace(4) %in
store <11 x i32> %ld, ptr addrspace(1) %out
@@ -1187,6 +1307,27 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: global_store_b128 v12, v[4:7], s[8:9] offset:16
; GFX12-NEXT: global_store_b128 v12, v[8:11], s[8:9]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v12i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b128 s[12:15], s[10:11], 0x20
+; GFX1250-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v12, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX1250-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX1250-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX1250-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s1
+; GFX1250-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v11, s3
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: global_store_b128 v12, v[0:3], s[8:9] offset:32
+; GFX1250-NEXT: global_store_b128 v12, v[4:7], s[8:9] offset:16
+; GFX1250-NEXT: global_store_b128 v12, v[8:11], s[8:9]
+; GFX1250-NEXT: s_endpgm
entry:
%ld = load <12 x i32>, ptr addrspace(4) %in
store <12 x i32> %ld, ptr addrspace(1) %out
@@ -1396,6 +1537,28 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: global_store_b128 v16, v[8:11], s[16:17] offset:16
; GFX12-NEXT: global_store_b128 v16, v[12:15], s[16:17]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v16i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[16:19], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b512 s[0:15], s[18:19], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v0, s12
+; GFX1250-NEXT: v_dual_mov_b32 v1, s13 :: v_dual_mov_b32 v2, s14
+; GFX1250-NEXT: v_dual_mov_b32 v3, s15 :: v_dual_mov_b32 v4, s8
+; GFX1250-NEXT: v_dual_mov_b32 v5, s9 :: v_dual_mov_b32 v6, s10
+; GFX1250-NEXT: v_dual_mov_b32 v7, s11 :: v_dual_mov_b32 v8, s4
+; GFX1250-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v10, s6
+; GFX1250-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v12, s0
+; GFX1250-NEXT: v_dual_mov_b32 v13, s1 :: v_dual_mov_b32 v14, s2
+; GFX1250-NEXT: v_mov_b32_e32 v15, s3
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v16, v[0:3], s[16:17] offset:48
+; GFX1250-NEXT: global_store_b128 v16, v[4:7], s[16:17] offset:32
+; GFX1250-NEXT: global_store_b128 v16, v[8:11], s[16:17] offset:16
+; GFX1250-NEXT: global_store_b128 v16, v[12:15], s[16:17]
+; GFX1250-NEXT: s_endpgm
entry:
%ld = load <16 x i32>, ptr addrspace(4) %in
store <16 x i32> %ld, ptr addrspace(1) %out
@@ -1482,6 +1645,16 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_i32_to_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ld = load i32, ptr addrspace(4) %in
%ext = zext i32 %ld to i64
store i64 %ext, ptr addrspace(1) %out
@@ -1576,6 +1749,19 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_i32_to_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ld = load i32, ptr addrspace(4) %in
%ext = sext i32 %ld to i64
store i64 %ext, ptr addrspace(1) %out
@@ -1662,6 +1848,16 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v1i32_to_v1i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ld = load <1 x i32>, ptr addrspace(4) %in
%ext = zext <1 x i32> %ld to <1 x i64>
store <1 x i64> %ext, ptr addrspace(1) %out
@@ -1756,6 +1952,19 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v1i32_to_v1i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ld = load <1 x i32>, ptr addrspace(4) %in
%ext = sext <1 x i32> %ld to <1 x i64>
store <1 x i64> %ext, ptr addrspace(1) %out
@@ -1855,6 +2064,18 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v2i32_to_v2i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ld = load <2 x i32>, ptr addrspace(4) %in
%ext = zext <2 x i32> %ld to <2 x i64>
store <2 x i64> %ext, ptr addrspace(1) %out
@@ -1968,6 +2189,21 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v2i32_to_v2i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-NEXT: s_ashr_i32 s4, s3, 31
+; GFX1250-NEXT: s_ashr_i32 s5, s2, 31
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: v_mov_b32_e32 v3, s4
+; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ld = load <2 x i32>, ptr addrspace(4) %in
%ext = sext <2 x i32> %ld to <2 x i64>
store <2 x i64> %ext, ptr addrspace(1) %out
@@ -2099,6 +2335,21 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
; GFX12-NEXT: v_mov_b32_e32 v2, s5
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v4i32_to_v4i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s7
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s5
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ld = load <4 x i32>, ptr addrspace(4) %in
%ext = zext <4 x i32> %ld to <4 x i64>
store <4 x i64> %ext, ptr addrspace(1) %out
@@ -2261,6 +2512,26 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v4i32_to_v4i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v0, s6
+; GFX1250-NEXT: s_ashr_i32 s8, s7, 31
+; GFX1250-NEXT: s_ashr_i32 s9, s6, 31
+; GFX1250-NEXT: s_ashr_i32 s2, s5, 31
+; GFX1250-NEXT: s_ashr_i32 s3, s4, 31
+; GFX1250-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v4, s4
+; GFX1250-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v1, s9
+; GFX1250-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v5, s3
+; GFX1250-NEXT: v_mov_b32_e32 v7, s2
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ld = load <4 x i32>, ptr addrspace(4) %in
%ext = sext <4 x i32> %ld to <4 x i64>
store <4 x i64> %ext, ptr addrspace(1) %out
@@ -2461,6 +2732,27 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX12-NEXT: v_mov_b32_e32 v2, s1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v8i32_to_v8i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s7
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:48
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s5
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:32
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:16
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s1
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[8:9]
+; GFX1250-NEXT: s_endpgm
%ld = load <8 x i32>, ptr addrspace(4) %in
%ext = zext <8 x i32> %ld to <8 x i64>
store <8 x i64> %ext, ptr addrspace(1) %out
@@ -2730,6 +3022,36 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v8i32_to_v8i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v0, s10
+; GFX1250-NEXT: s_ashr_i32 s16, s11, 31
+; GFX1250-NEXT: s_ashr_i32 s17, s10, 31
+; GFX1250-NEXT: s_ashr_i32 s14, s9, 31
+; GFX1250-NEXT: s_ashr_i32 s15, s8, 31
+; GFX1250-NEXT: s_ashr_i32 s12, s7, 31
+; GFX1250-NEXT: s_ashr_i32 s13, s6, 31
+; GFX1250-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v4, s8
+; GFX1250-NEXT: v_dual_mov_b32 v14, s5 :: v_dual_mov_b32 v1, s17
+; GFX1250-NEXT: v_dual_mov_b32 v3, s16 :: v_dual_mov_b32 v5, s15
+; GFX1250-NEXT: s_ashr_i32 s2, s5, 31
+; GFX1250-NEXT: s_ashr_i32 s3, s4, 31
+; GFX1250-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v8, s6
+; GFX1250-NEXT: v_dual_mov_b32 v7, s14 :: v_dual_mov_b32 v9, s13
+; GFX1250-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v12, s4
+; GFX1250-NEXT: v_dual_mov_b32 v11, s12 :: v_dual_mov_b32 v13, s3
+; GFX1250-NEXT: v_mov_b32_e32 v15, s2
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ld = load <8 x i32>, ptr addrspace(4) %in
%ext = sext <8 x i32> %ld to <8 x i64>
store <8 x i64> %ext, ptr addrspace(1) %out
@@ -3207,6 +3529,58 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
; GFX12-NEXT: global_store_b128 v28, v[24:27], s[16:17] offset:16
; GFX12-NEXT: global_store_b128 v28, v[0:3], s[16:17]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v16i32_to_v16i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[16:19], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b512 s[0:15], s[18:19], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v28, 0 :: v_dual_mov_b32 v0, s14
+; GFX1250-NEXT: s_ashr_i32 s28, s11, 31
+; GFX1250-NEXT: s_ashr_i32 s29, s10, 31
+; GFX1250-NEXT: s_ashr_i32 s30, s13, 31
+; GFX1250-NEXT: s_ashr_i32 s33, s15, 31
+; GFX1250-NEXT: s_ashr_i32 s34, s14, 31
+; GFX1250-NEXT: s_ashr_i32 s26, s9, 31
+; GFX1250-NEXT: s_ashr_i32 s27, s8, 31
+; GFX1250-NEXT: s_ashr_i32 s31, s12, 31
+; GFX1250-NEXT: s_ashr_i32 s24, s7, 31
+; GFX1250-NEXT: s_ashr_i32 s25, s6, 31
+; GFX1250-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v4, s12
+; GFX1250-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v8, s10
+; GFX1250-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v12, s8
+; GFX1250-NEXT: v_dual_mov_b32 v26, s3 :: v_dual_mov_b32 v1, s34
+; GFX1250-NEXT: v_dual_mov_b32 v3, s33 :: v_dual_mov_b32 v5, s31
+; GFX1250-NEXT: v_dual_mov_b32 v7, s30 :: v_dual_mov_b32 v9, s29
+; GFX1250-NEXT: v_dual_mov_b32 v11, s28 :: v_dual_mov_b32 v13, s27
+; GFX1250-NEXT: s_ashr_i32 s22, s5, 31
+; GFX1250-NEXT: s_ashr_i32 s23, s4, 31
+; GFX1250-NEXT: v_dual_mov_b32 v14, s9 :: v_dual_mov_b32 v16, s6
+; GFX1250-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v17, s25
+; GFX1250-NEXT: s_ashr_i32 s20, s3, 31
+; GFX1250-NEXT: s_ashr_i32 s21, s2, 31
+; GFX1250-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v20, s4
+; GFX1250-NEXT: v_dual_mov_b32 v19, s24 :: v_dual_mov_b32 v21, s23
+; GFX1250-NEXT: s_ashr_i32 s18, s1, 31
+; GFX1250-NEXT: s_ashr_i32 s19, s0, 31
+; GFX1250-NEXT: v_dual_mov_b32 v22, s5 :: v_dual_mov_b32 v24, s2
+; GFX1250-NEXT: v_dual_mov_b32 v23, s22 :: v_dual_mov_b32 v25, s21
+; GFX1250-NEXT: v_mov_b32_e32 v27, s20
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v28, v[0:3], s[16:17] offset:112
+; GFX1250-NEXT: global_store_b128 v28, v[4:7], s[16:17] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s19
+; GFX1250-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s18
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v28, v[8:11], s[16:17] offset:80
+; GFX1250-NEXT: global_store_b128 v28, v[12:15], s[16:17] offset:64
+; GFX1250-NEXT: global_store_b128 v28, v[16:19], s[16:17] offset:48
+; GFX1250-NEXT: global_store_b128 v28, v[20:23], s[16:17] offset:32
+; GFX1250-NEXT: global_store_b128 v28, v[24:27], s[16:17] offset:16
+; GFX1250-NEXT: global_store_b128 v28, v[0:3], s[16:17]
+; GFX1250-NEXT: s_endpgm
%ld = load <16 x i32>, ptr addrspace(4) %in
%ext = sext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, ptr addrspace(1) %out
@@ -3551,6 +3925,39 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %
; GFX12-NEXT: v_mov_b32_e32 v2, s1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v16i32_to_v16i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[16:19], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b512 s[0:15], s[18:19], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s14
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s15
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:112
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v2, s13
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v2, s11
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:80
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v2, s9
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:64
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v2, s7
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:48
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s5
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:32
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:16
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s1
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[16:17]
+; GFX1250-NEXT: s_endpgm
%ld = load <16 x i32>, ptr addrspace(4) %in
%ext = zext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, ptr addrspace(1) %out
@@ -4460,6 +4867,113 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:16
; GFX12-NEXT: global_store_b128 v24, v[20:23], s[36:37]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v32i32_to_v32i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[36:39], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b512 s[0:15], s[38:39], 0x0
+; GFX1250-NEXT: s_load_b512 s[16:31], s[38:39], 0x40
+; GFX1250-NEXT: v_mov_b32_e32 v24, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_ashr_i32 s49, s15, 31
+; GFX1250-NEXT: s_ashr_i32 s64, s31, 31
+; GFX1250-NEXT: s_ashr_i32 s65, s30, 31
+; GFX1250-NEXT: s_ashr_i32 s62, s29, 31
+; GFX1250-NEXT: s_ashr_i32 s63, s28, 31
+; GFX1250-NEXT: s_ashr_i32 s60, s27, 31
+; GFX1250-NEXT: s_ashr_i32 s61, s26, 31
+; GFX1250-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v2, s31
+; GFX1250-NEXT: v_dual_mov_b32 v4, s28 :: v_dual_mov_b32 v1, s65
+; GFX1250-NEXT: v_mov_b32_e32 v3, s64
+; GFX1250-NEXT: s_ashr_i32 s58, s25, 31
+; GFX1250-NEXT: s_ashr_i32 s59, s24, 31
+; GFX1250-NEXT: v_dual_mov_b32 v6, s29 :: v_dual_mov_b32 v8, s26
+; GFX1250-NEXT: v_dual_mov_b32 v5, s63 :: v_dual_mov_b32 v7, s62
+; GFX1250-NEXT: v_dual_mov_b32 v9, s61 :: v_dual_mov_b32 v10, s27
+; GFX1250-NEXT: v_dual_mov_b32 v11, s60 :: v_dual_mov_b32 v12, s24
+; GFX1250-NEXT: s_ashr_i32 s57, s23, 31
+; GFX1250-NEXT: v_dual_mov_b32 v13, s59 :: v_dual_mov_b32 v14, s25
+; GFX1250-NEXT: v_mov_b32_e32 v15, s58
+; GFX1250-NEXT: s_ashr_i32 s24, s22, 31
+; GFX1250-NEXT: s_ashr_i32 s55, s21, 31
+; GFX1250-NEXT: s_ashr_i32 s56, s20, 31
+; GFX1250-NEXT: s_ashr_i32 s53, s19, 31
+; GFX1250-NEXT: s_ashr_i32 s54, s18, 31
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:240
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:224
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:208
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:192
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s24
+; GFX1250-NEXT: v_dual_mov_b32 v2, s23 :: v_dual_mov_b32 v3, s57
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s20
+; GFX1250-NEXT: s_ashr_i32 s51, s17, 31
+; GFX1250-NEXT: s_ashr_i32 s52, s16, 31
+; GFX1250-NEXT: v_dual_mov_b32 v5, s56 :: v_dual_mov_b32 v6, s21
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v7, s55 :: v_dual_mov_b32 v8, s18
+; GFX1250-NEXT: s_ashr_i32 s50, s14, 31
+; GFX1250-NEXT: v_dual_mov_b32 v9, s54 :: v_dual_mov_b32 v10, s19
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v11, s53 :: v_dual_mov_b32 v12, s16
+; GFX1250-NEXT: s_ashr_i32 s45, s11, 31
+; GFX1250-NEXT: s_ashr_i32 s46, s10, 31
+; GFX1250-NEXT: s_ashr_i32 s47, s13, 31
+; GFX1250-NEXT: s_ashr_i32 s48, s12, 31
+; GFX1250-NEXT: v_dual_mov_b32 v13, s52 :: v_dual_mov_b32 v14, s17
+; GFX1250-NEXT: v_dual_mov_b32 v15, s51 :: v_dual_mov_b32 v16, s14
+; GFX1250-NEXT: s_ashr_i32 s43, s9, 31
+; GFX1250-NEXT: s_ashr_i32 s44, s8, 31
+; GFX1250-NEXT: v_dual_mov_b32 v17, s50 :: v_dual_mov_b32 v18, s15
+; GFX1250-NEXT: v_dual_mov_b32 v19, s49 :: v_dual_mov_b32 v20, s12
+; GFX1250-NEXT: s_ashr_i32 s41, s7, 31
+; GFX1250-NEXT: s_ashr_i32 s42, s6, 31
+; GFX1250-NEXT: v_dual_mov_b32 v21, s48 :: v_dual_mov_b32 v22, s13
+; GFX1250-NEXT: v_mov_b32_e32 v23, s47
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:176
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:160
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:144
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:128
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:112
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[36:37] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x5
+; GFX1250-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v1, s46
+; GFX1250-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v3, s45
+; GFX1250-NEXT: s_wait_xcnt 0x4
+; GFX1250-NEXT: v_mov_b32_e32 v4, s8
+; GFX1250-NEXT: s_ashr_i32 s39, s5, 31
+; GFX1250-NEXT: s_ashr_i32 s40, s4, 31
+; GFX1250-NEXT: v_dual_mov_b32 v5, s44 :: v_dual_mov_b32 v6, s9
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v7, s43 :: v_dual_mov_b32 v8, s6
+; GFX1250-NEXT: s_ashr_i32 s35, s3, 31
+; GFX1250-NEXT: s_ashr_i32 s38, s2, 31
+; GFX1250-NEXT: v_dual_mov_b32 v9, s42 :: v_dual_mov_b32 v10, s7
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_dual_mov_b32 v11, s41 :: v_dual_mov_b32 v12, s4
+; GFX1250-NEXT: s_ashr_i32 s33, s1, 31
+; GFX1250-NEXT: s_ashr_i32 s34, s0, 31
+; GFX1250-NEXT: v_dual_mov_b32 v13, s40 :: v_dual_mov_b32 v14, s5
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v15, s39 :: v_dual_mov_b32 v16, s2
+; GFX1250-NEXT: v_dual_mov_b32 v17, s38 :: v_dual_mov_b32 v18, s3
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v19, s35 :: v_dual_mov_b32 v20, s0
+; GFX1250-NEXT: v_dual_mov_b32 v21, s34 :: v_dual_mov_b32 v22, s1
+; GFX1250-NEXT: v_mov_b32_e32 v23, s33
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:80
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:64
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:48
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:32
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:16
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[36:37]
+; GFX1250-NEXT: s_endpgm
%ld = load <32 x i32>, ptr addrspace(4) %in
%ext = sext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, ptr addrspace(1) %out
@@ -5100,6 +5614,65 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: v_mov_b32_e32 v2, s1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v32i32_to_v32i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[36:39], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b512 s[16:31], s[38:39], 0x40
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_load_b512 s[0:15], s[38:39], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v3, v1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s31
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:240
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v2, s29
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:224
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v2, s27
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:208
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v2, s25
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:192
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v2, s23
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:176
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v2, s21
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:160
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s18 :: v_dual_mov_b32 v2, s19
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:144
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v2, s17
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:128
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v2, s15
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:112
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v2, s13
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v2, s11
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:80
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v2, s9
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:64
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v2, s7
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:48
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s5
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:32
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:16
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s1
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37]
+; GFX1250-NEXT: s_endpgm
%ld = load <32 x i32>, ptr addrspace(4) %in
%ext = zext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, ptr addrspace(1) %out
@@ -5472,6 +6045,42 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: global_store_b128 v32, v[24:27], s[36:37] offset:16
; GFX12-NEXT: global_store_b128 v32, v[28:31], s[36:37]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v32i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[36:39], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b512 s[16:31], s[38:39], 0x40
+; GFX1250-NEXT: s_load_b512 s[0:15], s[38:39], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v0, s28
+; GFX1250-NEXT: v_dual_mov_b32 v1, s29 :: v_dual_mov_b32 v2, s30
+; GFX1250-NEXT: v_dual_mov_b32 v3, s31 :: v_dual_mov_b32 v4, s24
+; GFX1250-NEXT: v_dual_mov_b32 v5, s25 :: v_dual_mov_b32 v6, s26
+; GFX1250-NEXT: v_dual_mov_b32 v7, s27 :: v_dual_mov_b32 v8, s20
+; GFX1250-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22
+; GFX1250-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s16
+; GFX1250-NEXT: v_dual_mov_b32 v13, s17 :: v_dual_mov_b32 v14, s18
+; GFX1250-NEXT: v_dual_mov_b32 v15, s19 :: v_dual_mov_b32 v16, s12
+; GFX1250-NEXT: v_dual_mov_b32 v17, s13 :: v_dual_mov_b32 v18, s14
+; GFX1250-NEXT: v_dual_mov_b32 v19, s15 :: v_dual_mov_b32 v20, s8
+; GFX1250-NEXT: v_dual_mov_b32 v21, s9 :: v_dual_mov_b32 v22, s10
+; GFX1250-NEXT: v_dual_mov_b32 v23, s11 :: v_dual_mov_b32 v24, s4
+; GFX1250-NEXT: v_dual_mov_b32 v25, s5 :: v_dual_mov_b32 v26, s6
+; GFX1250-NEXT: v_dual_mov_b32 v27, s7 :: v_dual_mov_b32 v28, s0
+; GFX1250-NEXT: v_dual_mov_b32 v29, s1 :: v_dual_mov_b32 v30, s2
+; GFX1250-NEXT: v_mov_b32_e32 v31, s3
+; GFX1250-NEXT: s_clause 0x7
+; GFX1250-NEXT: global_store_b128 v32, v[0:3], s[36:37] offset:112
+; GFX1250-NEXT: global_store_b128 v32, v[4:7], s[36:37] offset:96
+; GFX1250-NEXT: global_store_b128 v32, v[8:11], s[36:37] offset:80
+; GFX1250-NEXT: global_store_b128 v32, v[12:15], s[36:37] offset:64
+; GFX1250-NEXT: global_store_b128 v32, v[16:19], s[36:37] offset:48
+; GFX1250-NEXT: global_store_b128 v32, v[20:23], s[36:37] offset:32
+; GFX1250-NEXT: global_store_b128 v32, v[24:27], s[36:37] offset:16
+; GFX1250-NEXT: global_store_b128 v32, v[28:31], s[36:37]
+; GFX1250-NEXT: s_endpgm
%ld = load <32 x i32>, ptr addrspace(4) %in
store <32 x i32> %ld, ptr addrspace(1) %out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index a42c71c..c1a32aa 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -1259,13 +1259,12 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
@@ -1371,13 +1370,12 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
@@ -1646,13 +1644,12 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
@@ -1763,13 +1760,12 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
@@ -2044,13 +2040,12 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
@@ -2153,13 +2148,12 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
@@ -2419,11 +2413,11 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
@@ -2531,11 +2525,11 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
@@ -2795,10 +2789,9 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
@@ -2882,10 +2875,9 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
@@ -3095,8 +3087,8 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
@@ -3177,8 +3169,8 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
index 8351d28..739e86d 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
@@ -803,14 +803,14 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, 4.0, v3.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
@@ -918,14 +918,14 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 4.0, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
@@ -1199,14 +1199,14 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, 4.0, v3.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
@@ -1319,14 +1319,14 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 4.0, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
@@ -1606,14 +1606,14 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 4.0, v4.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
@@ -1718,14 +1718,14 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, 4.0, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
@@ -1990,13 +1990,12 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 4.0, v4.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
@@ -2107,13 +2106,12 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l
; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, 4.0, v4.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
@@ -2379,11 +2377,11 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, 4.0, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
@@ -2469,11 +2467,11 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l
; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, 4.0, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
@@ -2688,10 +2686,9 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, 4.0, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
@@ -2775,10 +2772,9 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, 4.0, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
index 0c4aca8..6da8026 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
@@ -803,14 +803,14 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
@@ -918,14 +918,14 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
@@ -1199,14 +1199,14 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
@@ -1319,14 +1319,14 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
@@ -1606,14 +1606,14 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
@@ -1718,14 +1718,14 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
@@ -1990,13 +1990,12 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
@@ -2107,13 +2106,12 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l
; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
@@ -2379,11 +2377,11 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, 4.0, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
@@ -2469,11 +2467,11 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l
; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, 4.0, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
@@ -2688,10 +2686,9 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, 4.0, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
@@ -2775,10 +2772,9 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, 4.0, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
index 37310b6..786989cc 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
@@ -1721,13 +1721,12 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
@@ -1833,13 +1832,12 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
@@ -2108,13 +2106,12 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
@@ -2225,13 +2222,12 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
@@ -2506,13 +2502,12 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
@@ -2615,13 +2610,12 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
@@ -2881,11 +2875,11 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
@@ -2993,11 +2987,11 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
@@ -3257,10 +3251,9 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
@@ -3344,10 +3337,9 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
@@ -3557,8 +3549,8 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
@@ -3639,8 +3631,8 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
index a3ebaec..5f0ca7b 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
@@ -74,7 +74,8 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) {
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_1
; FLATSCR-NEXT: ; %bb.2: ; %split
-; FLATSCR-NEXT: s_movk_i32 s0, 0x5000
+; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
+; FLATSCR-NEXT: s_addk_i32 s0, 0x3000
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:208 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_movk_i32 s0, 0x3000
@@ -175,7 +176,9 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) {
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_1
; FLATSCR-NEXT: ; %bb.2: ; %split
-; FLATSCR-NEXT: s_add_i32 s0, s33, 0x5000
+; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
+; FLATSCR-NEXT: s_add_i32 s1, s33, s0
+; FLATSCR-NEXT: s_add_i32 s0, s1, 0x3000
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:208 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_add_i32 s0, s33, 0x3000
@@ -223,30 +226,35 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_cbranch_scc1 .LBB2_1
; MUBUF-NEXT: ; %bb.2: ; %split
+; MUBUF-NEXT: s_movk_i32 s5, 0x12d4
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
-; MUBUF-NEXT: v_or_b32_e32 v0, 0x12d4, v1
+; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1
+; MUBUF-NEXT: s_movk_i32 s5, 0x12d0
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
; MUBUF-NEXT: s_movk_i32 s4, 0x4000
; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_or_b32_e32 v0, 0x12d0, v1
+; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1
+; MUBUF-NEXT: s_movk_i32 s5, 0x12c4
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
; MUBUF-NEXT: s_or_b32 s4, s4, 0x12c0
; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_or_b32_e32 v0, 0x12c4, v1
-; MUBUF-NEXT: v_mov_b32_e32 v3, 0x4000
+; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v0, s4
-; MUBUF-NEXT: v_or_b32_e32 v2, 0x12cc, v3
+; MUBUF-NEXT: s_movk_i32 s4, 0x12cc
+; MUBUF-NEXT: v_mov_b32_e32 v3, 0x4000
+; MUBUF-NEXT: v_or_b32_e32 v2, s4, v3
+; MUBUF-NEXT: s_movk_i32 s4, 0x12c8
; MUBUF-NEXT: v_mov_b32_e32 v6, 0x4000
; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v7, 0x4000
; MUBUF-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_or_b32_e32 v2, 0x12c8, v6
+; MUBUF-NEXT: v_or_b32_e32 v2, s4, v6
; MUBUF-NEXT: v_mov_b32_e32 v8, 0x4000
; MUBUF-NEXT: v_mov_b32_e32 v9, 0x4000
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen glc
@@ -298,7 +306,8 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_cbranch_scc1 .LBB2_1
; FLATSCR-NEXT: ; %bb.2: ; %split
-; FLATSCR-NEXT: s_movk_i32 s0, 0x3000
+; FLATSCR-NEXT: s_movk_i32 s0, 0x1000
+; FLATSCR-NEXT: s_addk_i32 s0, 0x2000
; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:720 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 offset:704 glc
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-constants.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-constants.ll
index a0c1e57..a09e392 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-constants.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-constants.ll
@@ -223,3 +223,43 @@ define i32 @fancy_zero() {
ptr addrspace(7) addrspacecast (ptr addrspace(8) @buf to ptr addrspace(7))
to i32)
}
+
+define i32 @load_null() {
+; CHECK-LABEL: define i32 @load_null
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 null, i32 0, i32 0, i32 0)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = load i32, ptr addrspace(7) null, align 4
+ ret i32 %x
+}
+
+define void @store_null() {
+; CHECK-LABEL: define void @store_null
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 0, ptr addrspace(8) align 4 null, i32 0, i32 0, i32 0)
+; CHECK-NEXT: ret void
+;
+ store i32 0, ptr addrspace(7) null, align 4
+ ret void
+}
+
+define i32 @load_poison() {
+; CHECK-LABEL: define i32 @load_poison
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT: [[X:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) align 4 poison, i32 poison, i32 0, i32 0)
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = load i32, ptr addrspace(7) poison, align 4
+ ret i32 %x
+}
+
+define void @store_poison() {
+; CHECK-LABEL: define void @store_poison
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 0, ptr addrspace(8) align 4 poison, i32 poison, i32 0, i32 0)
+; CHECK-NEXT: ret void
+;
+ store i32 0, ptr addrspace(7) poison, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll
index 63c0463..66de953 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll
@@ -255,6 +255,56 @@ define i32 @ptrtoint_offset(ptr addrspace(7) %ptr) {
ret i32 %ret
}
+define i32 @ptrtoaddr(ptr addrspace(7) %ptr) {
+; CHECK-LABEL: define i32 @ptrtoaddr
+; CHECK-SAME: ({ ptr addrspace(8), i32 } [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
+; CHECK-NEXT: [[RET:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
+; CHECK-NEXT: ret i32 [[RET]]
+;
+ %ret = ptrtoaddr ptr addrspace(7) %ptr to i32
+ ret i32 %ret
+}
+
+define <2 x i32> @ptrtoaddr_vec(<2 x ptr addrspace(7)> %ptr) {
+; CHECK-LABEL: define <2 x i32> @ptrtoaddr_vec
+; CHECK-SAME: ({ <2 x ptr addrspace(8)>, <2 x i32> } [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[PTR]], 0
+; CHECK-NEXT: [[RET:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[PTR]], 1
+; CHECK-NEXT: ret <2 x i32> [[RET]]
+;
+ %ret = ptrtoaddr <2 x ptr addrspace(7)> %ptr to <2 x i32>
+ ret <2 x i32> %ret
+}
+
+;; Check that we extend the offset to i160.
+define i160 @ptrtoaddr_ext(ptr addrspace(7) %ptr) {
+; CHECK-LABEL: define i160 @ptrtoaddr_ext
+; CHECK-SAME: ({ ptr addrspace(8), i32 } [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
+; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
+; CHECK-NEXT: [[RET:%.*]] = zext i32 [[PTR_OFF]] to i160
+; CHECK-NEXT: ret i160 [[RET]]
+;
+ %addr = ptrtoaddr ptr addrspace(7) %ptr to i32
+ %ext = zext i32 %addr to i160
+ ret i160 %ext
+}
+
+;; Check that we truncate the offset to i16.
+define i16 @ptrtoaddr_trunc(ptr addrspace(7) %ptr) {
+; CHECK-LABEL: define i16 @ptrtoaddr_trunc
+; CHECK-SAME: ({ ptr addrspace(8), i32 } [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
+; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
+; CHECK-NEXT: [[RET:%.*]] = trunc i32 [[PTR_OFF]] to i16
+; CHECK-NEXT: ret i16 [[RET]]
+;
+ %addr = ptrtoaddr ptr addrspace(7) %ptr to i32
+ %trunc = trunc i32 %addr to i16
+ ret i16 %trunc
+}
+
define ptr addrspace(7) @inttoptr(i160 %v) {
; CHECK-LABEL: define { ptr addrspace(8), i32 } @inttoptr
; CHECK-SAME: (i160 [[V:%.*]]) #[[ATTR0]] {
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
index 8281320..42a4829c 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll
@@ -15,51 +15,52 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace
; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF]], [[META13]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[META23]])
; CHECK-NEXT: [[BUF_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[BUF]] to i160, !dbg [[DBG24:![0-9]+]]
; CHECK-NEXT: [[BUF_PTR_INT:%.*]] = shl nuw i160 [[BUF_PTR_INT_RSRC]], 32, !dbg [[DBG24]]
-; CHECK-NEXT: store i160 [[BUF_PTR_INT]], ptr addrspace(5) [[BUF_PTR_VAR]], align 32, !dbg [[DBG24]]
-; CHECK-NEXT: #dbg_value(i32 0, [[META15:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[META25:![0-9]+]])
-; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[AUX]], [[META15]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[META25]])
-; CHECK-NEXT: [[AUX_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[AUX]] to i160, !dbg [[DBG26:![0-9]+]]
-; CHECK-NEXT: [[AUX_PTR_INT:%.*]] = shl nuw i160 [[AUX_PTR_INT_RSRC]], 32, !dbg [[DBG26]]
-; CHECK-NEXT: store i160 [[AUX_PTR_INT]], ptr addrspace(5) [[AUX_PTR_VAR]], align 32, !dbg [[DBG26]]
-; CHECK-NEXT: [[BUF_PTR_2:%.*]] = load i160, ptr addrspace(5) [[BUF_PTR_VAR]], align 32, !dbg [[DBG27:![0-9]+]]
-; CHECK-NEXT: [[TMP1:%.*]] = lshr i160 [[BUF_PTR_2]], 32, !dbg [[DBG27]]
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i160 [[TMP1]] to i128, !dbg [[DBG27]]
-; CHECK-NEXT: [[BUF_PTR_2_PTR_RSRC:%.*]] = inttoptr i128 [[TMP2]] to ptr addrspace(8), !dbg [[DBG27]]
-; CHECK-NEXT: [[BUF_PTR_2_PTR_OFF:%.*]] = trunc i160 [[BUF_PTR_2]] to i32, !dbg [[DBG27]]
-; CHECK-NEXT: #dbg_value(i32 [[BUF_PTR_2_PTR_OFF]], [[META16:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG27]])
-; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]], [[META16]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG27]])
-; CHECK-NEXT: [[BUF_PTR_3_IDX:%.*]] = mul i32 [[IDX]], 4, !dbg [[DBG28:![0-9]+]]
-; CHECK-NEXT: [[BUF_PTR_3:%.*]] = add i32 [[BUF_PTR_2_PTR_OFF]], [[BUF_PTR_3_IDX]], !dbg [[DBG28]]
-; CHECK-NEXT: #dbg_value(i32 [[BUF_PTR_3]], [[META17:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG28]])
-; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]], [[META17]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG28]])
-; CHECK-NEXT: [[BUF_PTR_3_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]] to i160, !dbg [[DBG29:![0-9]+]]
-; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i160 [[BUF_PTR_3_INT_RSRC]], 32, !dbg [[DBG29]]
-; CHECK-NEXT: [[BUF_PTR_3_INT_OFF:%.*]] = zext i32 [[BUF_PTR_3]] to i160, !dbg [[DBG29]]
-; CHECK-NEXT: [[BUF_PTR_3_INT:%.*]] = or i160 [[TMP3]], [[BUF_PTR_3_INT_OFF]], !dbg [[DBG29]]
-; CHECK-NEXT: store i160 [[BUF_PTR_3_INT]], ptr addrspace(5) [[BUF_PTR_VAR]], align 32, !dbg [[DBG29]]
-; CHECK-NEXT: [[BUF_PTR_4:%.*]] = load i160, ptr addrspace(5) [[BUF_PTR_VAR]], align 32, !dbg [[DBG30:![0-9]+]]
-; CHECK-NEXT: [[TMP4:%.*]] = lshr i160 [[BUF_PTR_4]], 32, !dbg [[DBG30]]
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i160 [[TMP4]] to i128, !dbg [[DBG30]]
-; CHECK-NEXT: [[BUF_PTR_4_PTR_RSRC:%.*]] = inttoptr i128 [[TMP5]] to ptr addrspace(8), !dbg [[DBG30]]
-; CHECK-NEXT: [[BUF_PTR_4_PTR_OFF:%.*]] = trunc i160 [[BUF_PTR_4]] to i32, !dbg [[DBG30]]
-; CHECK-NEXT: #dbg_value(i32 [[BUF_PTR_4_PTR_OFF]], [[META18:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG30]])
-; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF_PTR_4_PTR_RSRC]], [[META18]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG30]])
-; CHECK-NEXT: [[RET:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF_PTR_4_PTR_RSRC]], i32 [[BUF_PTR_4_PTR_OFF]], i32 0, i32 0), !dbg [[DBG31:![0-9]+]]
-; CHECK-NEXT: #dbg_value(float [[RET]], [[META19:![0-9]+]], !DIExpression(), [[DBG31]])
-; CHECK-NEXT: [[AUX_PTR_2:%.*]] = load i160, ptr addrspace(5) [[AUX_PTR_VAR]], align 32, !dbg [[DBG32:![0-9]+]]
-; CHECK-NEXT: [[TMP6:%.*]] = lshr i160 [[AUX_PTR_2]], 32, !dbg [[DBG32]]
-; CHECK-NEXT: [[TMP7:%.*]] = trunc i160 [[TMP6]] to i128, !dbg [[DBG32]]
-; CHECK-NEXT: [[AUX_PTR_2_PTR_RSRC:%.*]] = inttoptr i128 [[TMP7]] to ptr addrspace(8), !dbg [[DBG32]]
-; CHECK-NEXT: [[AUX_PTR_2_PTR_OFF:%.*]] = trunc i160 [[AUX_PTR_2]] to i32, !dbg [[DBG32]]
-; CHECK-NEXT: #dbg_value(i32 [[AUX_PTR_2_PTR_OFF]], [[META20:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG32]])
-; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[AUX_PTR_2_PTR_RSRC]], [[META20]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG32]])
-; CHECK-NEXT: [[BUF_PTR_4_LEGAL:%.*]] = bitcast i160 [[BUF_PTR_4]] to <5 x i32>, !dbg [[DBG33:![0-9]+]]
-; CHECK-NEXT: [[BUF_PTR_4_SLICE_0:%.*]] = shufflevector <5 x i32> [[BUF_PTR_4_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG33]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[BUF_PTR_4_SLICE_0]], ptr addrspace(8) align 32 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG33]]
-; CHECK-NEXT: [[AUX_PTR_2_PTR_PART_4:%.*]] = add nuw i32 [[AUX_PTR_2_PTR_OFF]], 16, !dbg [[DBG33]]
-; CHECK-NEXT: [[BUF_PTR_4_SLICE_4:%.*]] = extractelement <5 x i32> [[BUF_PTR_4_LEGAL]], i64 4, !dbg [[DBG33]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[BUF_PTR_4_SLICE_4]], ptr addrspace(8) align 16 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_PART_4]], i32 0, i32 0), !dbg [[DBG33]]
-; CHECK-NEXT: ret float [[RET]], !dbg [[DBG34:![0-9]+]]
+; CHECK-NEXT: store i160 [[BUF_PTR_INT]], ptr addrspace(5) [[BUF_PTR_VAR]], align 32, !dbg [[DBG24]], !DIAssignID [[DIASSIGNID25:![0-9]+]]
+; CHECK-NEXT: #dbg_assign(i160 [[BUF_PTR_INT]], [[META13]], !DIExpression(), [[DIASSIGNID25]], ptr addrspace(5) [[BUF_PTR_VAR]], !DIExpression(), [[DBG21]])
+; CHECK-NEXT: #dbg_value(i32 0, [[META15:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[META26:![0-9]+]])
+; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[AUX]], [[META15]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[META26]])
+; CHECK-NEXT: [[AUX_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[AUX]] to i160, !dbg [[DBG27:![0-9]+]]
+; CHECK-NEXT: [[AUX_PTR_INT:%.*]] = shl nuw i160 [[AUX_PTR_INT_RSRC]], 32, !dbg [[DBG27]]
+; CHECK-NEXT: store i160 [[AUX_PTR_INT]], ptr addrspace(5) [[AUX_PTR_VAR]], align 32, !dbg [[DBG27]]
+; CHECK-NEXT: [[BUF_PTR_2:%.*]] = load i160, ptr addrspace(5) [[BUF_PTR_VAR]], align 32, !dbg [[DBG28:![0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i160 [[BUF_PTR_2]], 32, !dbg [[DBG28]]
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i160 [[TMP1]] to i128, !dbg [[DBG28]]
+; CHECK-NEXT: [[BUF_PTR_2_PTR_RSRC:%.*]] = inttoptr i128 [[TMP2]] to ptr addrspace(8), !dbg [[DBG28]]
+; CHECK-NEXT: [[BUF_PTR_2_PTR_OFF:%.*]] = trunc i160 [[BUF_PTR_2]] to i32, !dbg [[DBG28]]
+; CHECK-NEXT: #dbg_value(i32 [[BUF_PTR_2_PTR_OFF]], [[META16:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG28]])
+; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]], [[META16]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG28]])
+; CHECK-NEXT: [[BUF_PTR_3_IDX:%.*]] = mul i32 [[IDX]], 4, !dbg [[DBG29:![0-9]+]]
+; CHECK-NEXT: [[BUF_PTR_3:%.*]] = add i32 [[BUF_PTR_2_PTR_OFF]], [[BUF_PTR_3_IDX]], !dbg [[DBG29]]
+; CHECK-NEXT: #dbg_value(i32 [[BUF_PTR_3]], [[META17:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG29]])
+; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]], [[META17]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG29]])
+; CHECK-NEXT: [[BUF_PTR_3_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]] to i160, !dbg [[DBG30:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i160 [[BUF_PTR_3_INT_RSRC]], 32, !dbg [[DBG30]]
+; CHECK-NEXT: [[BUF_PTR_3_INT_OFF:%.*]] = zext i32 [[BUF_PTR_3]] to i160, !dbg [[DBG30]]
+; CHECK-NEXT: [[BUF_PTR_3_INT:%.*]] = or i160 [[TMP3]], [[BUF_PTR_3_INT_OFF]], !dbg [[DBG30]]
+; CHECK-NEXT: store i160 [[BUF_PTR_3_INT]], ptr addrspace(5) [[BUF_PTR_VAR]], align 32, !dbg [[DBG30]]
+; CHECK-NEXT: [[BUF_PTR_4:%.*]] = load i160, ptr addrspace(5) [[BUF_PTR_VAR]], align 32, !dbg [[DBG31:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i160 [[BUF_PTR_4]], 32, !dbg [[DBG31]]
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i160 [[TMP4]] to i128, !dbg [[DBG31]]
+; CHECK-NEXT: [[BUF_PTR_4_PTR_RSRC:%.*]] = inttoptr i128 [[TMP5]] to ptr addrspace(8), !dbg [[DBG31]]
+; CHECK-NEXT: [[BUF_PTR_4_PTR_OFF:%.*]] = trunc i160 [[BUF_PTR_4]] to i32, !dbg [[DBG31]]
+; CHECK-NEXT: #dbg_value(i32 [[BUF_PTR_4_PTR_OFF]], [[META18:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG31]])
+; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF_PTR_4_PTR_RSRC]], [[META18]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG31]])
+; CHECK-NEXT: [[RET:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF_PTR_4_PTR_RSRC]], i32 [[BUF_PTR_4_PTR_OFF]], i32 0, i32 0), !dbg [[DBG32:![0-9]+]]
+; CHECK-NEXT: #dbg_value(float [[RET]], [[META19:![0-9]+]], !DIExpression(), [[DBG32]])
+; CHECK-NEXT: [[AUX_PTR_2:%.*]] = load i160, ptr addrspace(5) [[AUX_PTR_VAR]], align 32, !dbg [[DBG33:![0-9]+]]
+; CHECK-NEXT: [[TMP6:%.*]] = lshr i160 [[AUX_PTR_2]], 32, !dbg [[DBG33]]
+; CHECK-NEXT: [[TMP7:%.*]] = trunc i160 [[TMP6]] to i128, !dbg [[DBG33]]
+; CHECK-NEXT: [[AUX_PTR_2_PTR_RSRC:%.*]] = inttoptr i128 [[TMP7]] to ptr addrspace(8), !dbg [[DBG33]]
+; CHECK-NEXT: [[AUX_PTR_2_PTR_OFF:%.*]] = trunc i160 [[AUX_PTR_2]] to i32, !dbg [[DBG33]]
+; CHECK-NEXT: #dbg_value(i32 [[AUX_PTR_2_PTR_OFF]], [[META20:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG33]])
+; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[AUX_PTR_2_PTR_RSRC]], [[META20]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG33]])
+; CHECK-NEXT: [[BUF_PTR_4_LEGAL:%.*]] = bitcast i160 [[BUF_PTR_4]] to <5 x i32>, !dbg [[DBG34:![0-9]+]]
+; CHECK-NEXT: [[BUF_PTR_4_SLICE_0:%.*]] = shufflevector <5 x i32> [[BUF_PTR_4_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG34]]
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[BUF_PTR_4_SLICE_0]], ptr addrspace(8) align 32 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG34]]
+; CHECK-NEXT: [[AUX_PTR_2_PTR_PART_4:%.*]] = add nuw i32 [[AUX_PTR_2_PTR_OFF]], 16, !dbg [[DBG34]]
+; CHECK-NEXT: [[BUF_PTR_4_SLICE_4:%.*]] = extractelement <5 x i32> [[BUF_PTR_4_LEGAL]], i64 4, !dbg [[DBG34]]
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[BUF_PTR_4_SLICE_4]], ptr addrspace(8) align 16 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_PART_4]], i32 0, i32 0), !dbg [[DBG34]]
+; CHECK-NEXT: ret float [[RET]], !dbg [[DBG35:![0-9]+]]
;
%buf.ptr.var = alloca ptr addrspace(7), align 32, addrspace(5), !dbg !20
call void @llvm.dbg.value(metadata ptr addrspace(5) %buf.ptr.var, metadata !9, metadata !DIExpression()), !dbg !20
@@ -67,7 +68,8 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace
call void @llvm.dbg.value(metadata ptr addrspace(5) %aux.ptr.var, metadata !11, metadata !DIExpression()), !dbg !21
%buf.ptr = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7), !dbg !22
call void @llvm.dbg.value(metadata ptr addrspace(7) %buf.ptr, metadata !12, metadata !DIExpression()), !dbg !22
- store ptr addrspace(7) %buf.ptr, ptr addrspace(5) %buf.ptr.var, align 32, !dbg !23
+ store ptr addrspace(7) %buf.ptr, ptr addrspace(5) %buf.ptr.var, align 32, !dbg !23, !DIAssignID !40
+ call void @llvm.dbg.assign(metadata ptr addrspace(7) %buf.ptr, metadata !12, metadata !DIExpression(), metadata !40, metadata ptr addrspace(5) %buf.ptr.var, metadata !DIExpression()), !dbg !20
%aux.ptr = addrspacecast ptr addrspace(8) %aux to ptr addrspace(7), !dbg !24
call void @llvm.dbg.value(metadata ptr addrspace(7) %aux.ptr, metadata !14, metadata !DIExpression()), !dbg !24
store ptr addrspace(7) %aux.ptr, ptr addrspace(5) %aux.ptr.var, align 32, !dbg !25
@@ -129,3 +131,4 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo
!31 = !DILocation(line: 12, column: 1, scope: !5)
!32 = !DILocation(line: 13, column: 1, scope: !5)
!33 = !DILocation(line: 14, column: 1, scope: !5)
+!40 = distinct !DIAssignID()
diff --git a/llvm/test/CodeGen/AMDGPU/lower-intrinsics-barriers.ll b/llvm/test/CodeGen/AMDGPU/lower-intrinsics-barriers.ll
new file mode 100644
index 0000000..bc70c3b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-intrinsics-barriers.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -codegen-opt-level=0 | FileCheck --check-prefixes=GFX11,GFX11-NOOPT %s
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -codegen-opt-level=1 -mattr=+wavefrontsize32 | FileCheck --check-prefixes=GFX11,OPT-WAVE32,GFX11-OPT-WAVE32 %s
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -codegen-opt-level=1 -mattr=+wavefrontsize64 | FileCheck --check-prefixes=GFX11,OPT-WAVE64,GFX11-OPT-WAVE64 %s
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -codegen-opt-level=0 | FileCheck --check-prefixes=GFX12,GFX12-NOOPT %s
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -codegen-opt-level=1 -mattr=+wavefrontsize32 | FileCheck --check-prefixes=GFX12,OPT-WAVE32,GFX12-OPT-WAVE32 %s
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -codegen-opt-level=1 -mattr=+wavefrontsize64 | FileCheck --check-prefixes=GFX12,OPT-WAVE64,GFX12-OPT-WAVE64 %s
+
+define amdgpu_kernel void @barrier() {
+; GFX11-LABEL: define amdgpu_kernel void @barrier(
+; GFX11-SAME: ) #[[ATTR0:[0-9]+]] {
+; GFX11-NEXT: call void @llvm.amdgcn.s.barrier()
+; GFX11-NEXT: ret void
+;
+; GFX12-LABEL: define amdgpu_kernel void @barrier(
+; GFX12-SAME: ) #[[ATTR0:[0-9]+]] {
+; GFX12-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+; GFX12-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+; GFX12-NEXT: ret void
+;
+ call void @llvm.amdgcn.s.barrier()
+ ret void
+}
+
+define amdgpu_kernel void @barrier_32threads() "amdgpu-flat-work-group-size"="32,32" {
+; GFX11-NOOPT-LABEL: define amdgpu_kernel void @barrier_32threads(
+; GFX11-NOOPT-SAME: ) #[[ATTR1:[0-9]+]] {
+; GFX11-NOOPT-NEXT: call void @llvm.amdgcn.s.barrier()
+; GFX11-NOOPT-NEXT: ret void
+;
+; OPT-WAVE32-LABEL: define amdgpu_kernel void @barrier_32threads(
+; OPT-WAVE32-SAME: ) #[[ATTR1:[0-9]+]] {
+; OPT-WAVE32-NEXT: call void @llvm.amdgcn.wave.barrier()
+; OPT-WAVE32-NEXT: ret void
+;
+; OPT-WAVE64-LABEL: define amdgpu_kernel void @barrier_32threads(
+; OPT-WAVE64-SAME: ) #[[ATTR1:[0-9]+]] {
+; OPT-WAVE64-NEXT: call void @llvm.amdgcn.wave.barrier()
+; OPT-WAVE64-NEXT: ret void
+;
+; GFX12-NOOPT-LABEL: define amdgpu_kernel void @barrier_32threads(
+; GFX12-NOOPT-SAME: ) #[[ATTR1:[0-9]+]] {
+; GFX12-NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+; GFX12-NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+; GFX12-NOOPT-NEXT: ret void
+;
+ call void @llvm.amdgcn.s.barrier()
+ ret void
+}
+
+define amdgpu_kernel void @barrier_64threads() "amdgpu-flat-work-group-size"="64,64" {
+; GFX11-NOOPT-LABEL: define amdgpu_kernel void @barrier_64threads(
+; GFX11-NOOPT-SAME: ) #[[ATTR2:[0-9]+]] {
+; GFX11-NOOPT-NEXT: call void @llvm.amdgcn.s.barrier()
+; GFX11-NOOPT-NEXT: ret void
+;
+; GFX11-OPT-WAVE32-LABEL: define amdgpu_kernel void @barrier_64threads(
+; GFX11-OPT-WAVE32-SAME: ) #[[ATTR2:[0-9]+]] {
+; GFX11-OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier()
+; GFX11-OPT-WAVE32-NEXT: ret void
+;
+; OPT-WAVE64-LABEL: define amdgpu_kernel void @barrier_64threads(
+; OPT-WAVE64-SAME: ) #[[ATTR2:[0-9]+]] {
+; OPT-WAVE64-NEXT: call void @llvm.amdgcn.wave.barrier()
+; OPT-WAVE64-NEXT: ret void
+;
+; GFX12-NOOPT-LABEL: define amdgpu_kernel void @barrier_64threads(
+; GFX12-NOOPT-SAME: ) #[[ATTR2:[0-9]+]] {
+; GFX12-NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+; GFX12-NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+; GFX12-NOOPT-NEXT: ret void
+;
+; GFX12-OPT-WAVE32-LABEL: define amdgpu_kernel void @barrier_64threads(
+; GFX12-OPT-WAVE32-SAME: ) #[[ATTR2:[0-9]+]] {
+; GFX12-OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+; GFX12-OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+; GFX12-OPT-WAVE32-NEXT: ret void
+;
+ call void @llvm.amdgcn.s.barrier()
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-OPT-WAVE64: {{.*}}
+; GFX12-OPT-WAVE64: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-intrinsics-split-barriers.ll b/llvm/test/CodeGen/AMDGPU/lower-intrinsics-split-barriers.ll
new file mode 100644
index 0000000..69ad4b6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-intrinsics-split-barriers.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -codegen-opt-level=0 | FileCheck --check-prefixes=CHECK,NOOPT %s
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -codegen-opt-level=1 -mattr=+wavefrontsize32 | FileCheck --check-prefixes=CHECK,OPT-WAVE32 %s
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -codegen-opt-level=1 -mattr=+wavefrontsize64 | FileCheck --check-prefixes=CHECK,OPT-WAVE64 %s
+
+declare void @foo(i1)
+
+define amdgpu_kernel void @barrier() {
+; CHECK-LABEL: define amdgpu_kernel void @barrier(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+; CHECK-NEXT: [[ISFIRST:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+; CHECK-NEXT: call void @foo(i1 [[ISFIRST]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+ %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+ call void @foo(i1 %isfirst)
+ ret void
+}
+
+define amdgpu_kernel void @barrier_32threads() "amdgpu-flat-work-group-size"="32,32" {
+; NOOPT-LABEL: define amdgpu_kernel void @barrier_32threads(
+; NOOPT-SAME: ) #[[ATTR1:[0-9]+]] {
+; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+; NOOPT-NEXT: [[ISFIRST:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+; NOOPT-NEXT: call void @foo(i1 [[ISFIRST]])
+; NOOPT-NEXT: ret void
+;
+; OPT-WAVE32-LABEL: define amdgpu_kernel void @barrier_32threads(
+; OPT-WAVE32-SAME: ) #[[ATTR1:[0-9]+]] {
+; OPT-WAVE32-NEXT: call void @llvm.amdgcn.wave.barrier()
+; OPT-WAVE32-NEXT: call void @foo(i1 true)
+; OPT-WAVE32-NEXT: ret void
+;
+; OPT-WAVE64-LABEL: define amdgpu_kernel void @barrier_32threads(
+; OPT-WAVE64-SAME: ) #[[ATTR1:[0-9]+]] {
+; OPT-WAVE64-NEXT: call void @llvm.amdgcn.wave.barrier()
+; OPT-WAVE64-NEXT: call void @foo(i1 true)
+; OPT-WAVE64-NEXT: ret void
+;
+ call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+ %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+ call void @foo(i1 %isfirst)
+ ret void
+}
+
+define amdgpu_kernel void @barrier_64threads() "amdgpu-flat-work-group-size"="64,64" {
+; NOOPT-LABEL: define amdgpu_kernel void @barrier_64threads(
+; NOOPT-SAME: ) #[[ATTR2:[0-9]+]] {
+; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+; NOOPT-NEXT: [[ISFIRST:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+; NOOPT-NEXT: call void @foo(i1 [[ISFIRST]])
+; NOOPT-NEXT: ret void
+;
+; OPT-WAVE32-LABEL: define amdgpu_kernel void @barrier_64threads(
+; OPT-WAVE32-SAME: ) #[[ATTR2:[0-9]+]] {
+; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+; OPT-WAVE32-NEXT: [[ISFIRST:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+; OPT-WAVE32-NEXT: call void @foo(i1 [[ISFIRST]])
+; OPT-WAVE32-NEXT: ret void
+;
+; OPT-WAVE64-LABEL: define amdgpu_kernel void @barrier_64threads(
+; OPT-WAVE64-SAME: ) #[[ATTR2:[0-9]+]] {
+; OPT-WAVE64-NEXT: call void @llvm.amdgcn.wave.barrier()
+; OPT-WAVE64-NEXT: call void @foo(i1 true)
+; OPT-WAVE64-NEXT: ret void
+;
+ call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+ %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+ call void @foo(i1 %isfirst)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
index 6e92677..247a0a9 100644
--- a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GCN,GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
define i64 @lshl_add_u64_v1v(i64 %v, i64 %a) {
; GCN-LABEL: lshl_add_u64_v1v:
@@ -19,7 +20,9 @@ define i64 @lshl_add_u64_v4v(i64 %v, i64 %a) {
define i64 @lshl_add_u64_v5v(i64 %v, i64 %a) {
; GCN-LABEL: lshl_add_u64_v5v:
; GCN: v_lshlrev_b64
-; GCN-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
+; GFX942-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
+; GFX1250-NEXT: s_delay_alu
+; GFX1250-NEXT: v_add_nc_u64_e32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
%shl = shl i64 %v, 5
%add = add i64 %shl, %a
ret i64 %add
@@ -28,7 +31,9 @@ define i64 @lshl_add_u64_v5v(i64 %v, i64 %a) {
define i64 @lshl_add_u64_vvv(i64 %v, i64 %s, i64 %a) {
; GCN-LABEL: lshl_add_u64_vvv:
; GCN: v_lshlrev_b64
-; GCN-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
+; GFX942-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
+; GFX1250-NEXT: s_delay_alu
+; GFX1250-NEXT: v_add_nc_u64_e32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
%shl = shl i64 %v, %s
%add = add i64 %shl, %a
ret i64 %add
@@ -57,8 +62,9 @@ define amdgpu_kernel void @lshl_add_u64_v2s(i64 %a) {
define amdgpu_kernel void @lshl_add_u64_s2s(i64 %v, i64 %a) {
; GCN-LABEL: lshl_add_u64_s2s:
; GCN: s_lshl_b64
-; GCN: s_add_u32
-; GCN: s_addc_u32
+; GFX942: s_add_u32
+; GFX942: s_addc_u32
+; GFX1250: s_add_nc_u64
%shl = shl i64 %v, 2
%add = add i64 %shl, %a
store i64 %add, ptr poison
@@ -67,14 +73,16 @@ define amdgpu_kernel void @lshl_add_u64_s2s(i64 %v, i64 %a) {
define i64 @add_u64_vv(i64 %v, i64 %a) {
; GCN-LABEL: add_u64_vv:
-; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX1250: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
%add = add i64 %v, %a
ret i64 %add
}
define amdgpu_kernel void @add_u64_sv(i64 %v) {
; GCN-LABEL: add_u64_sv:
-; GCN: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX1250: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
%a = load i64, ptr poison
%add = add i64 %v, %a
store i64 %add, ptr poison
@@ -83,7 +91,8 @@ define amdgpu_kernel void @add_u64_sv(i64 %v) {
define amdgpu_kernel void @add_u64_vs(i64 %a) {
; GCN-LABEL: add_u64_vs:
-; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX942: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
%v = load i64, ptr poison
%add = add i64 %v, %a
store i64 %add, ptr poison
@@ -92,8 +101,9 @@ define amdgpu_kernel void @add_u64_vs(i64 %a) {
define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) {
; GCN-LABEL: add_u64_ss:
-; GCN: s_add_u32
-; GCN: s_addc_u32 s1, s1, s3
+; GFX942: s_add_u32
+; GFX942: s_addc_u32 s1, s1, s3
+; GFX1250: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
%add = add i64 %v, %a
store i64 %add, ptr poison
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll b/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
index cd428be..966e5c8 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
@@ -35,7 +35,8 @@ define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
; CHECK-LABEL: {{^}}name: realign_stack
; CHECK: scratchReservedForDynamicVGPRs: 512
%v = alloca <32 x i32>, align 128, addrspace(5)
- store <32 x i32> %x, ptr addrspace(5) %v
+  ; use a volatile store to avoid promotion of the alloca to registers
+ store volatile <32 x i32> %x, ptr addrspace(5) %v
call amdgpu_gfx void @callee(i32 71)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
index 23412aa..3b3ea3f 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir
@@ -347,8 +347,10 @@ body: |
...
# The user-requested maximum number of VGPRs needs to be taken into account by
# the scheduler's rematerialization stage. Register usage above that number
-# is considered like spill; occupancy is "inadvertently" increased when
-# eliminating spill.
+# is treated as spill. On a unified RF (gfx90a), the requested number is
+# understood "per-bank", effectively doubling its value, so no rematerialization
+# is necessary.
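+# (Illustration of the doubling with a hypothetical limit, not the one used by
+# this test: a request for at most 16 VGPRs would let a unified-RF target use
+# up to 32 ArchVGPRs before usage is treated as spill.)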
+---
name: small_num_vgprs_as_spill
tracksRegLiveness: true
machineFunctionInfo:
@@ -371,36 +373,15 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]]
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]], implicit [[V_CVT_I32_F64_e32_30]]
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_31]], implicit [[V_CVT_I32_F64_e32_32]], implicit [[V_CVT_I32_F64_e32_33]], implicit [[V_CVT_I32_F64_e32_27]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_14]], implicit [[V_CVT_I32_F64_e32_15]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_13]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: small_num_vgprs_as_spill
@@ -420,36 +401,15 @@ body: |
; GFX90A-NEXT: [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
; GFX90A-NEXT: [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
; GFX90A-NEXT: [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode
+ ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode
+ ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode
+ ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.1:
; GFX90A-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]]
; GFX90A-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
; GFX90A-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode
- ; GFX90A-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]], implicit [[V_CVT_I32_F64_e32_30]]
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
- ; GFX90A-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_31]], implicit [[V_CVT_I32_F64_e32_32]], implicit [[V_CVT_I32_F64_e32_33]], implicit [[V_CVT_I32_F64_e32_27]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]]
; GFX90A-NEXT: S_ENDPGM 0
bb.0:
successors: %bb.1
@@ -467,38 +427,16 @@ body: |
%10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
%11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
%12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
- %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
- %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
- %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
- %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
- %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
- %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
- %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
- %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
- %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
- %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
- %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
- %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
- %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
- %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
- %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
- %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode
- %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode
- %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode
- %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode
- %32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
- %33:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode
+ %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode
+ %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode
+ %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode
bb.1:
S_NOP 0, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4
S_NOP 0, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9
S_NOP 0, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14
- S_NOP 0, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19
- S_NOP 0, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24
- S_NOP 0, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29
- S_NOP 0, implicit %30, implicit %31, implicit %32, implicit %33
-
+ S_NOP 0, implicit %15
S_ENDPGM 0
...
# Min/Max occupancy is 8, but user requests 7, the scheduler's rematerialization
@@ -815,9 +753,9 @@ body: |
S_ENDPGM 0
...
# Min/Max waves/EU is 8. For targets with non-unified RF (gfx908) we are able to
-# eliminate both ArchVGPR and AGPR spilling by saving 2 VGPRs. In the unified RF
-# case (gfx90a) the ArchVGPR allocation granule forces us to remat more
-# ArchVGPRs to eliminate spilling.
+# eliminate both ArchVGPR and AGPR spilling by saving one of each. In the
+# unified RF case (gfx90a) the ArchVGPR allocation granule may force us to remat
+# more ArchVGPRs to eliminate spilling.
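+# (For instance, if the unified RF hands out ArchVGPRs in blocks of some
+# granule size, freeing a single ArchVGPR does not shrink the allocation until
+# a whole block boundary is crossed, so extra remat may be needed.)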
---
name: reduce_arch_and_acc_vgrp_spill
tracksRegLiveness: true
@@ -860,6 +798,7 @@ body: |
; GFX908-NEXT: [[DEF28:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[DEF29:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[DEF30:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF31:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
@@ -886,12 +825,11 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 64, implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 64, implicit $exec, implicit $mode
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]]
@@ -899,17 +837,17 @@ body: |
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_14]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]], implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]], implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]], implicit [[V_CVT_I32_F64_e32_24]]
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
- ; GFX908-NEXT: [[DEF31:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_25]], implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_32]], implicit [[V_CVT_I32_F64_e32_27]], implicit [[V_CVT_I32_F64_e32_28]]
; GFX908-NEXT: [[DEF32:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]], implicit [[DEF31]], implicit [[DEF32]], implicit [[DEF]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF6]], implicit [[DEF7]], implicit [[DEF8]], implicit [[DEF9]], implicit [[DEF10]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF11]], implicit [[DEF12]], implicit [[DEF13]], implicit [[DEF14]], implicit [[DEF15]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF16]], implicit [[DEF17]], implicit [[DEF18]], implicit [[DEF19]], implicit [[DEF20]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF21]], implicit [[DEF22]], implicit [[DEF23]], implicit [[DEF24]], implicit [[DEF25]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF26]], implicit [[DEF27]], implicit [[DEF28]], implicit [[DEF29]], implicit [[V_CVT_I32_F64_e32_32]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF30]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_29]], implicit [[V_CVT_I32_F64_e32_30]], implicit [[DEF32]], implicit [[DEF]], implicit [[DEF1]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF7]], implicit [[DEF8]], implicit [[DEF9]], implicit [[DEF10]], implicit [[DEF11]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF12]], implicit [[DEF13]], implicit [[DEF14]], implicit [[DEF15]], implicit [[DEF16]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF17]], implicit [[DEF18]], implicit [[DEF19]], implicit [[DEF20]], implicit [[DEF21]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF22]], implicit [[DEF23]], implicit [[DEF24]], implicit [[DEF25]], implicit [[DEF26]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF27]], implicit [[DEF28]], implicit [[DEF29]], implicit [[DEF30]], implicit [[V_CVT_I32_F64_e32_31]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF31]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: reduce_arch_and_acc_vgrp_spill
@@ -1358,8 +1296,7 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_252:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 252, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_253:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 253, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_254:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 254, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_255:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 255, implicit $exec, implicit $mode
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_256:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 256, implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_255:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 256, implicit $exec, implicit $mode
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
@@ -1387,7 +1324,8 @@ body: |
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_220]], implicit [[V_CVT_I32_F64_e32_221]], implicit [[V_CVT_I32_F64_e32_222]], implicit [[V_CVT_I32_F64_e32_223]], implicit [[V_CVT_I32_F64_e32_224]], implicit [[V_CVT_I32_F64_e32_225]], implicit [[V_CVT_I32_F64_e32_226]], implicit [[V_CVT_I32_F64_e32_227]], implicit [[V_CVT_I32_F64_e32_228]], implicit [[V_CVT_I32_F64_e32_229]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_230]], implicit [[V_CVT_I32_F64_e32_231]], implicit [[V_CVT_I32_F64_e32_232]], implicit [[V_CVT_I32_F64_e32_233]], implicit [[V_CVT_I32_F64_e32_234]], implicit [[V_CVT_I32_F64_e32_235]], implicit [[V_CVT_I32_F64_e32_236]], implicit [[V_CVT_I32_F64_e32_237]], implicit [[V_CVT_I32_F64_e32_238]], implicit [[V_CVT_I32_F64_e32_239]]
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_240]], implicit [[V_CVT_I32_F64_e32_241]], implicit [[V_CVT_I32_F64_e32_242]], implicit [[V_CVT_I32_F64_e32_243]], implicit [[V_CVT_I32_F64_e32_244]], implicit [[V_CVT_I32_F64_e32_245]], implicit [[V_CVT_I32_F64_e32_246]], implicit [[V_CVT_I32_F64_e32_247]], implicit [[V_CVT_I32_F64_e32_248]], implicit [[V_CVT_I32_F64_e32_249]]
- ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_250]], implicit [[V_CVT_I32_F64_e32_251]], implicit [[V_CVT_I32_F64_e32_252]], implicit [[V_CVT_I32_F64_e32_253]], implicit [[V_CVT_I32_F64_e32_254]], implicit [[V_CVT_I32_F64_e32_255]], implicit [[V_CVT_I32_F64_e32_256]], implicit [[DEF]]
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_256:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 255, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_250]], implicit [[V_CVT_I32_F64_e32_251]], implicit [[V_CVT_I32_F64_e32_252]], implicit [[V_CVT_I32_F64_e32_253]], implicit [[V_CVT_I32_F64_e32_254]], implicit [[V_CVT_I32_F64_e32_256]], implicit [[V_CVT_I32_F64_e32_255]], implicit [[DEF]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: reduce_spill_archvgpr_above_addressable_limit
@@ -1395,6 +1333,7 @@ body: |
; GFX90A-NEXT: successors: %bb.1(0x80000000)
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
; GFX90A-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
; GFX90A-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
; GFX90A-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
@@ -1650,8 +1589,6 @@ body: |
; GFX90A-NEXT: [[V_CVT_I32_F64_e32_253:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 253, implicit $exec, implicit $mode, implicit-def $m0
; GFX90A-NEXT: [[V_CVT_I32_F64_e32_254:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 254, implicit $exec, implicit $mode, implicit-def $m0
; GFX90A-NEXT: [[V_CVT_I32_F64_e32_255:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 256, implicit $exec, implicit $mode
- ; GFX90A-NEXT: [[DEF:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_256:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 255, implicit $exec, implicit $mode
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.1:
; GFX90A-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_9]]
@@ -1679,6 +1616,7 @@ body: |
; GFX90A-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_220]], implicit [[V_CVT_I32_F64_e32_221]], implicit [[V_CVT_I32_F64_e32_222]], implicit [[V_CVT_I32_F64_e32_223]], implicit [[V_CVT_I32_F64_e32_224]], implicit [[V_CVT_I32_F64_e32_225]], implicit [[V_CVT_I32_F64_e32_226]], implicit [[V_CVT_I32_F64_e32_227]], implicit [[V_CVT_I32_F64_e32_228]], implicit [[V_CVT_I32_F64_e32_229]]
; GFX90A-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_230]], implicit [[V_CVT_I32_F64_e32_231]], implicit [[V_CVT_I32_F64_e32_232]], implicit [[V_CVT_I32_F64_e32_233]], implicit [[V_CVT_I32_F64_e32_234]], implicit [[V_CVT_I32_F64_e32_235]], implicit [[V_CVT_I32_F64_e32_236]], implicit [[V_CVT_I32_F64_e32_237]], implicit [[V_CVT_I32_F64_e32_238]], implicit [[V_CVT_I32_F64_e32_239]]
; GFX90A-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_240]], implicit [[V_CVT_I32_F64_e32_241]], implicit [[V_CVT_I32_F64_e32_242]], implicit [[V_CVT_I32_F64_e32_243]], implicit [[V_CVT_I32_F64_e32_244]], implicit [[V_CVT_I32_F64_e32_245]], implicit [[V_CVT_I32_F64_e32_246]], implicit [[V_CVT_I32_F64_e32_247]], implicit [[V_CVT_I32_F64_e32_248]], implicit [[V_CVT_I32_F64_e32_249]]
+ ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_256:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 255, implicit $exec, implicit $mode
; GFX90A-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_250]], implicit [[V_CVT_I32_F64_e32_251]], implicit [[V_CVT_I32_F64_e32_252]], implicit [[V_CVT_I32_F64_e32_253]], implicit [[V_CVT_I32_F64_e32_254]], implicit [[V_CVT_I32_F64_e32_256]], implicit [[V_CVT_I32_F64_e32_255]], implicit [[DEF]]
; GFX90A-NEXT: S_ENDPGM 0
bb.0:
@@ -2246,35 +2184,35 @@ body: |
; GFX908-NEXT: [[DEF253:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[DEF254:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
; GFX908-NEXT: [[DEF255:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
- ; GFX908-NEXT: [[DEF256:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF128]], implicit [[DEF129]], implicit [[DEF130]], implicit [[DEF131]], implicit [[DEF132]], implicit [[DEF133]], implicit [[DEF134]], implicit [[DEF135]], implicit [[DEF136]], implicit [[DEF137]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF138]], implicit [[DEF139]], implicit [[DEF140]], implicit [[DEF141]], implicit [[DEF142]], implicit [[DEF143]], implicit [[DEF144]], implicit [[DEF145]], implicit [[DEF146]], implicit [[DEF147]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF148]], implicit [[DEF149]], implicit [[DEF150]], implicit [[DEF151]], implicit [[DEF152]], implicit [[DEF153]], implicit [[DEF154]], implicit [[DEF155]], implicit [[DEF156]], implicit [[DEF157]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF158]], implicit [[DEF159]], implicit [[DEF160]], implicit [[DEF161]], implicit [[DEF162]], implicit [[DEF163]], implicit [[DEF164]], implicit [[DEF165]], implicit [[DEF166]], implicit [[DEF167]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF168]], implicit [[DEF169]], implicit [[DEF170]], implicit [[DEF171]], implicit [[DEF172]], implicit [[DEF173]], implicit [[DEF174]], implicit [[DEF175]], implicit [[DEF176]], implicit [[DEF177]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF178]], implicit [[DEF179]], implicit [[DEF180]], implicit [[DEF181]], implicit [[DEF182]], implicit [[DEF183]], implicit [[DEF184]], implicit [[DEF185]], implicit [[DEF186]], implicit [[DEF187]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF188]], implicit [[DEF189]], implicit [[DEF190]], implicit [[DEF191]], implicit [[DEF192]], implicit [[DEF193]], implicit [[DEF194]], implicit [[DEF195]], implicit [[DEF196]], implicit [[DEF197]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF198]], implicit [[DEF199]], implicit [[DEF200]], implicit [[DEF201]], implicit [[DEF202]], implicit [[DEF203]], implicit [[DEF204]], implicit [[DEF205]], implicit [[DEF206]], implicit [[DEF207]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF208]], implicit [[DEF209]], implicit [[DEF210]], implicit [[DEF211]], implicit [[DEF212]], implicit [[DEF213]], implicit [[DEF214]], implicit [[DEF215]], implicit [[DEF216]], implicit [[DEF217]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF218]], implicit [[DEF219]], implicit [[DEF220]], implicit [[DEF221]], implicit [[DEF222]], implicit [[DEF223]], implicit [[DEF224]], implicit [[DEF225]], implicit [[DEF226]], implicit [[DEF227]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF228]], implicit [[DEF229]], implicit [[DEF230]], implicit [[DEF231]], implicit [[DEF232]], implicit [[DEF233]], implicit [[DEF234]], implicit [[DEF235]], implicit [[DEF236]], implicit [[DEF237]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF238]], implicit [[DEF239]], implicit [[DEF240]], implicit [[DEF241]], implicit [[DEF242]], implicit [[DEF243]], implicit [[DEF244]], implicit [[DEF245]], implicit [[DEF246]], implicit [[DEF247]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF248]], implicit [[DEF249]], implicit [[DEF250]], implicit [[DEF251]], implicit [[DEF252]], implicit [[DEF253]], implicit [[DEF254]], implicit [[DEF255]], implicit [[DEF256]], implicit [[DEF]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]], implicit [[DEF8]], implicit [[DEF9]], implicit [[DEF10]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF11]], implicit [[DEF12]], implicit [[DEF13]], implicit [[DEF14]], implicit [[DEF15]], implicit [[DEF16]], implicit [[DEF17]], implicit [[DEF18]], implicit [[DEF19]], implicit [[DEF20]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF21]], implicit [[DEF22]], implicit [[DEF23]], implicit [[DEF24]], implicit [[DEF25]], implicit [[DEF26]], implicit [[DEF27]], implicit [[DEF28]], implicit [[DEF29]], implicit [[DEF30]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF31]], implicit [[DEF32]], implicit [[DEF33]], implicit [[DEF34]], implicit [[DEF35]], implicit [[DEF36]], implicit [[DEF37]], implicit [[DEF38]], implicit [[DEF39]], implicit [[DEF40]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF41]], implicit [[DEF42]], implicit [[DEF43]], implicit [[DEF44]], implicit [[DEF45]], implicit [[DEF46]], implicit [[DEF47]], implicit [[DEF48]], implicit [[DEF49]], implicit [[DEF50]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF51]], implicit [[DEF52]], implicit [[DEF53]], implicit [[DEF54]], implicit [[DEF55]], implicit [[DEF56]], implicit [[DEF57]], implicit [[DEF58]], implicit [[DEF59]], implicit [[DEF60]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF61]], implicit [[DEF62]], implicit [[DEF63]], implicit [[DEF64]], implicit [[DEF65]], implicit [[DEF66]], implicit [[DEF67]], implicit [[DEF68]], implicit [[DEF69]], implicit [[DEF70]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF71]], implicit [[DEF72]], implicit [[DEF73]], implicit [[DEF74]], implicit [[DEF75]], implicit [[DEF76]], implicit [[DEF77]], implicit [[DEF78]], implicit [[DEF79]], implicit [[DEF80]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF81]], implicit [[DEF82]], implicit [[DEF83]], implicit [[DEF84]], implicit [[DEF85]], implicit [[DEF86]], implicit [[DEF87]], implicit [[DEF88]], implicit [[DEF89]], implicit [[DEF90]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF91]], implicit [[DEF92]], implicit [[DEF93]], implicit [[DEF94]], implicit [[DEF95]], implicit [[DEF96]], implicit [[DEF97]], implicit [[DEF98]], implicit [[DEF99]], implicit [[DEF100]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF101]], implicit [[DEF102]], implicit [[DEF103]], implicit [[DEF104]], implicit [[DEF105]], implicit [[DEF106]], implicit [[DEF107]], implicit [[DEF108]], implicit [[DEF109]], implicit [[DEF110]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF111]], implicit [[DEF112]], implicit [[DEF113]], implicit [[DEF114]], implicit [[DEF115]], implicit [[DEF116]], implicit [[DEF117]], implicit [[DEF118]], implicit [[DEF119]], implicit [[DEF120]]
- ; GFX908-NEXT: S_NOP 0, implicit [[DEF121]], implicit [[DEF122]], implicit [[DEF123]], implicit [[DEF124]], implicit [[DEF125]], implicit [[DEF126]], implicit [[DEF127]], implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
+ ; GFX908-NEXT: [[DEF256:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF256]], implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]], implicit [[DEF8]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF9]], implicit [[DEF10]], implicit [[DEF11]], implicit [[DEF12]], implicit [[DEF13]], implicit [[DEF14]], implicit [[DEF15]], implicit [[DEF16]], implicit [[DEF17]], implicit [[DEF18]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF19]], implicit [[DEF20]], implicit [[DEF21]], implicit [[DEF22]], implicit [[DEF23]], implicit [[DEF24]], implicit [[DEF25]], implicit [[DEF26]], implicit [[DEF27]], implicit [[DEF28]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF29]], implicit [[DEF30]], implicit [[DEF31]], implicit [[DEF32]], implicit [[DEF33]], implicit [[DEF34]], implicit [[DEF35]], implicit [[DEF36]], implicit [[DEF37]], implicit [[DEF38]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF39]], implicit [[DEF40]], implicit [[DEF41]], implicit [[DEF42]], implicit [[DEF43]], implicit [[DEF44]], implicit [[DEF45]], implicit [[DEF46]], implicit [[DEF47]], implicit [[DEF48]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF49]], implicit [[DEF50]], implicit [[DEF51]], implicit [[DEF52]], implicit [[DEF53]], implicit [[DEF54]], implicit [[DEF55]], implicit [[DEF56]], implicit [[DEF57]], implicit [[DEF58]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF59]], implicit [[DEF60]], implicit [[DEF61]], implicit [[DEF62]], implicit [[DEF63]], implicit [[DEF64]], implicit [[DEF65]], implicit [[DEF66]], implicit [[DEF67]], implicit [[DEF68]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF69]], implicit [[DEF70]], implicit [[DEF71]], implicit [[DEF72]], implicit [[DEF73]], implicit [[DEF74]], implicit [[DEF75]], implicit [[DEF76]], implicit [[DEF77]], implicit [[DEF78]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF79]], implicit [[DEF80]], implicit [[DEF81]], implicit [[DEF82]], implicit [[DEF83]], implicit [[DEF84]], implicit [[DEF85]], implicit [[DEF86]], implicit [[DEF87]], implicit [[DEF88]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF89]], implicit [[DEF90]], implicit [[DEF91]], implicit [[DEF92]], implicit [[DEF93]], implicit [[DEF94]], implicit [[DEF95]], implicit [[DEF96]], implicit [[DEF97]], implicit [[DEF98]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF99]], implicit [[DEF100]], implicit [[DEF101]], implicit [[DEF102]], implicit [[DEF103]], implicit [[DEF104]], implicit [[DEF105]], implicit [[DEF106]], implicit [[DEF107]], implicit [[DEF108]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF109]], implicit [[DEF110]], implicit [[DEF111]], implicit [[DEF112]], implicit [[DEF113]], implicit [[DEF114]], implicit [[DEF115]], implicit [[DEF116]], implicit [[DEF117]], implicit [[DEF118]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF119]], implicit [[DEF120]], implicit [[DEF121]], implicit [[DEF122]], implicit [[DEF123]], implicit [[DEF124]], implicit [[DEF125]], implicit [[DEF126]], implicit [[DEF127]], implicit [[DEF128]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF129]], implicit [[DEF130]], implicit [[DEF131]], implicit [[DEF132]], implicit [[DEF133]], implicit [[DEF134]], implicit [[DEF135]], implicit [[DEF136]], implicit [[DEF137]], implicit [[DEF138]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF139]], implicit [[DEF140]], implicit [[DEF141]], implicit [[DEF142]], implicit [[DEF143]], implicit [[DEF144]], implicit [[DEF145]], implicit [[DEF146]], implicit [[DEF147]], implicit [[DEF148]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF149]], implicit [[DEF150]], implicit [[DEF151]], implicit [[DEF152]], implicit [[DEF153]], implicit [[DEF154]], implicit [[DEF155]], implicit [[DEF156]], implicit [[DEF157]], implicit [[DEF158]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF159]], implicit [[DEF160]], implicit [[DEF161]], implicit [[DEF162]], implicit [[DEF163]], implicit [[DEF164]], implicit [[DEF165]], implicit [[DEF166]], implicit [[DEF167]], implicit [[DEF168]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF169]], implicit [[DEF170]], implicit [[DEF171]], implicit [[DEF172]], implicit [[DEF173]], implicit [[DEF174]], implicit [[DEF175]], implicit [[DEF176]], implicit [[DEF177]], implicit [[DEF178]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF179]], implicit [[DEF180]], implicit [[DEF181]], implicit [[DEF182]], implicit [[DEF183]], implicit [[DEF184]], implicit [[DEF185]], implicit [[DEF186]], implicit [[DEF187]], implicit [[DEF188]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF189]], implicit [[DEF190]], implicit [[DEF191]], implicit [[DEF192]], implicit [[DEF193]], implicit [[DEF194]], implicit [[DEF195]], implicit [[DEF196]], implicit [[DEF197]], implicit [[DEF198]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF199]], implicit [[DEF200]], implicit [[DEF201]], implicit [[DEF202]], implicit [[DEF203]], implicit [[DEF204]], implicit [[DEF205]], implicit [[DEF206]], implicit [[DEF207]], implicit [[DEF208]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF209]], implicit [[DEF210]], implicit [[DEF211]], implicit [[DEF212]], implicit [[DEF213]], implicit [[DEF214]], implicit [[DEF215]], implicit [[DEF216]], implicit [[DEF217]], implicit [[DEF218]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF219]], implicit [[DEF220]], implicit [[DEF221]], implicit [[DEF222]], implicit [[DEF223]], implicit [[DEF224]], implicit [[DEF225]], implicit [[DEF226]], implicit [[DEF227]], implicit [[DEF228]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF229]], implicit [[DEF230]], implicit [[DEF231]], implicit [[DEF232]], implicit [[DEF233]], implicit [[DEF234]], implicit [[DEF235]], implicit [[DEF236]], implicit [[DEF237]], implicit [[DEF238]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF239]], implicit [[DEF240]], implicit [[DEF241]], implicit [[DEF242]], implicit [[DEF243]], implicit [[DEF244]], implicit [[DEF245]], implicit [[DEF246]], implicit [[DEF247]], implicit [[DEF248]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[DEF249]], implicit [[DEF250]], implicit [[DEF251]], implicit [[DEF252]], implicit [[DEF253]], implicit [[DEF254]], implicit [[DEF255]], implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: reduce_spill_agpr_above_addressable_limit
@@ -2533,41 +2471,41 @@ body: |
; GFX90A-NEXT: [[DEF249:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
; GFX90A-NEXT: [[DEF250:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
; GFX90A-NEXT: [[DEF251:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 257, implicit $exec, implicit $mode
- ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 258, implicit $exec, implicit $mode
; GFX90A-NEXT: [[DEF252:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
; GFX90A-NEXT: [[DEF253:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
; GFX90A-NEXT: [[DEF254:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
; GFX90A-NEXT: [[DEF255:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
- ; GFX90A-NEXT: [[DEF256:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 257, implicit $exec, implicit $mode
+ ; GFX90A-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 258, implicit $exec, implicit $mode
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.1:
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF252]], implicit [[DEF253]], implicit [[DEF254]], implicit [[DEF255]], implicit [[DEF256]], implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]], implicit [[DEF8]], implicit [[DEF9]], implicit [[DEF10]], implicit [[DEF11]], implicit [[DEF12]], implicit [[DEF13]], implicit [[DEF14]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF15]], implicit [[DEF16]], implicit [[DEF17]], implicit [[DEF18]], implicit [[DEF19]], implicit [[DEF20]], implicit [[DEF21]], implicit [[DEF22]], implicit [[DEF23]], implicit [[DEF24]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF25]], implicit [[DEF26]], implicit [[DEF27]], implicit [[DEF28]], implicit [[DEF29]], implicit [[DEF30]], implicit [[DEF31]], implicit [[DEF32]], implicit [[DEF33]], implicit [[DEF34]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF35]], implicit [[DEF36]], implicit [[DEF37]], implicit [[DEF38]], implicit [[DEF39]], implicit [[DEF40]], implicit [[DEF41]], implicit [[DEF42]], implicit [[DEF43]], implicit [[DEF44]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF45]], implicit [[DEF46]], implicit [[DEF47]], implicit [[DEF48]], implicit [[DEF49]], implicit [[DEF50]], implicit [[DEF51]], implicit [[DEF52]], implicit [[DEF53]], implicit [[DEF54]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF55]], implicit [[DEF56]], implicit [[DEF57]], implicit [[DEF58]], implicit [[DEF59]], implicit [[DEF60]], implicit [[DEF61]], implicit [[DEF62]], implicit [[DEF63]], implicit [[DEF64]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF65]], implicit [[DEF66]], implicit [[DEF67]], implicit [[DEF68]], implicit [[DEF69]], implicit [[DEF70]], implicit [[DEF71]], implicit [[DEF72]], implicit [[DEF73]], implicit [[DEF74]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF75]], implicit [[DEF76]], implicit [[DEF77]], implicit [[DEF78]], implicit [[DEF79]], implicit [[DEF80]], implicit [[DEF81]], implicit [[DEF82]], implicit [[DEF83]], implicit [[DEF84]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF85]], implicit [[DEF86]], implicit [[DEF87]], implicit [[DEF88]], implicit [[DEF89]], implicit [[DEF90]], implicit [[DEF91]], implicit [[DEF92]], implicit [[DEF93]], implicit [[DEF94]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF95]], implicit [[DEF96]], implicit [[DEF97]], implicit [[DEF98]], implicit [[DEF99]], implicit [[DEF100]], implicit [[DEF101]], implicit [[DEF102]], implicit [[DEF103]], implicit [[DEF104]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF105]], implicit [[DEF106]], implicit [[DEF107]], implicit [[DEF108]], implicit [[DEF109]], implicit [[DEF110]], implicit [[DEF111]], implicit [[DEF112]], implicit [[DEF113]], implicit [[DEF114]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF115]], implicit [[DEF116]], implicit [[DEF117]], implicit [[DEF118]], implicit [[DEF119]], implicit [[DEF120]], implicit [[DEF121]], implicit [[DEF122]], implicit [[DEF123]], implicit [[DEF124]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF125]], implicit [[DEF126]], implicit [[DEF127]], implicit [[DEF128]], implicit [[DEF129]], implicit [[DEF130]], implicit [[DEF131]], implicit [[DEF132]], implicit [[DEF133]], implicit [[DEF134]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF135]], implicit [[DEF136]], implicit [[DEF137]], implicit [[DEF138]], implicit [[DEF139]], implicit [[DEF140]], implicit [[DEF141]], implicit [[DEF142]], implicit [[DEF143]], implicit [[DEF144]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF145]], implicit [[DEF146]], implicit [[DEF147]], implicit [[DEF148]], implicit [[DEF149]], implicit [[DEF150]], implicit [[DEF151]], implicit [[DEF152]], implicit [[DEF153]], implicit [[DEF154]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF155]], implicit [[DEF156]], implicit [[DEF157]], implicit [[DEF158]], implicit [[DEF159]], implicit [[DEF160]], implicit [[DEF161]], implicit [[DEF162]], implicit [[DEF163]], implicit [[DEF164]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF165]], implicit [[DEF166]], implicit [[DEF167]], implicit [[DEF168]], implicit [[DEF169]], implicit [[DEF170]], implicit [[DEF171]], implicit [[DEF172]], implicit [[DEF173]], implicit [[DEF174]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF175]], implicit [[DEF176]], implicit [[DEF177]], implicit [[DEF178]], implicit [[DEF179]], implicit [[DEF180]], implicit [[DEF181]], implicit [[DEF182]], implicit [[DEF183]], implicit [[DEF184]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF185]], implicit [[DEF186]], implicit [[DEF187]], implicit [[DEF188]], implicit [[DEF189]], implicit [[DEF190]], implicit [[DEF191]], implicit [[DEF192]], implicit [[DEF193]], implicit [[DEF194]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF195]], implicit [[DEF196]], implicit [[DEF197]], implicit [[DEF198]], implicit [[DEF199]], implicit [[DEF200]], implicit [[DEF201]], implicit [[DEF202]], implicit [[DEF203]], implicit [[DEF204]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF205]], implicit [[DEF206]], implicit [[DEF207]], implicit [[DEF208]], implicit [[DEF209]], implicit [[DEF210]], implicit [[DEF211]], implicit [[DEF212]], implicit [[DEF213]], implicit [[DEF214]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF215]], implicit [[DEF216]], implicit [[DEF217]], implicit [[DEF218]], implicit [[DEF219]], implicit [[DEF220]], implicit [[DEF221]], implicit [[DEF222]], implicit [[DEF223]], implicit [[DEF224]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF225]], implicit [[DEF226]], implicit [[DEF227]], implicit [[DEF228]], implicit [[DEF229]], implicit [[DEF230]], implicit [[DEF231]], implicit [[DEF232]], implicit [[DEF233]], implicit [[DEF234]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF235]], implicit [[DEF236]], implicit [[DEF237]], implicit [[DEF238]], implicit [[DEF239]], implicit [[DEF240]], implicit [[DEF241]], implicit [[DEF242]], implicit [[DEF243]], implicit [[DEF244]]
- ; GFX90A-NEXT: S_NOP 0, implicit [[DEF245]], implicit [[DEF246]], implicit [[DEF247]], implicit [[DEF248]], implicit [[DEF249]], implicit [[DEF250]], implicit [[DEF251]], implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
+ ; GFX90A-NEXT: [[DEF256:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF256]], implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]], implicit [[DEF8]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF9]], implicit [[DEF10]], implicit [[DEF11]], implicit [[DEF12]], implicit [[DEF13]], implicit [[DEF14]], implicit [[DEF15]], implicit [[DEF16]], implicit [[DEF17]], implicit [[DEF18]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF19]], implicit [[DEF20]], implicit [[DEF21]], implicit [[DEF22]], implicit [[DEF23]], implicit [[DEF24]], implicit [[DEF25]], implicit [[DEF26]], implicit [[DEF27]], implicit [[DEF28]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF29]], implicit [[DEF30]], implicit [[DEF31]], implicit [[DEF32]], implicit [[DEF33]], implicit [[DEF34]], implicit [[DEF35]], implicit [[DEF36]], implicit [[DEF37]], implicit [[DEF38]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF39]], implicit [[DEF40]], implicit [[DEF41]], implicit [[DEF42]], implicit [[DEF43]], implicit [[DEF44]], implicit [[DEF45]], implicit [[DEF46]], implicit [[DEF47]], implicit [[DEF48]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF49]], implicit [[DEF50]], implicit [[DEF51]], implicit [[DEF52]], implicit [[DEF53]], implicit [[DEF54]], implicit [[DEF55]], implicit [[DEF56]], implicit [[DEF57]], implicit [[DEF58]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF59]], implicit [[DEF60]], implicit [[DEF61]], implicit [[DEF62]], implicit [[DEF63]], implicit [[DEF64]], implicit [[DEF65]], implicit [[DEF66]], implicit [[DEF67]], implicit [[DEF68]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF69]], implicit [[DEF70]], implicit [[DEF71]], implicit [[DEF72]], implicit [[DEF73]], implicit [[DEF74]], implicit [[DEF75]], implicit [[DEF76]], implicit [[DEF77]], implicit [[DEF78]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF79]], implicit [[DEF80]], implicit [[DEF81]], implicit [[DEF82]], implicit [[DEF83]], implicit [[DEF84]], implicit [[DEF85]], implicit [[DEF86]], implicit [[DEF87]], implicit [[DEF88]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF89]], implicit [[DEF90]], implicit [[DEF91]], implicit [[DEF92]], implicit [[DEF93]], implicit [[DEF94]], implicit [[DEF95]], implicit [[DEF96]], implicit [[DEF97]], implicit [[DEF98]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF99]], implicit [[DEF100]], implicit [[DEF101]], implicit [[DEF102]], implicit [[DEF103]], implicit [[DEF104]], implicit [[DEF105]], implicit [[DEF106]], implicit [[DEF107]], implicit [[DEF108]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF109]], implicit [[DEF110]], implicit [[DEF111]], implicit [[DEF112]], implicit [[DEF113]], implicit [[DEF114]], implicit [[DEF115]], implicit [[DEF116]], implicit [[DEF117]], implicit [[DEF118]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF119]], implicit [[DEF120]], implicit [[DEF121]], implicit [[DEF122]], implicit [[DEF123]], implicit [[DEF124]], implicit [[DEF125]], implicit [[DEF126]], implicit [[DEF127]], implicit [[DEF128]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF129]], implicit [[DEF130]], implicit [[DEF131]], implicit [[DEF132]], implicit [[DEF133]], implicit [[DEF134]], implicit [[DEF135]], implicit [[DEF136]], implicit [[DEF137]], implicit [[DEF138]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF139]], implicit [[DEF140]], implicit [[DEF141]], implicit [[DEF142]], implicit [[DEF143]], implicit [[DEF144]], implicit [[DEF145]], implicit [[DEF146]], implicit [[DEF147]], implicit [[DEF148]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF149]], implicit [[DEF150]], implicit [[DEF151]], implicit [[DEF152]], implicit [[DEF153]], implicit [[DEF154]], implicit [[DEF155]], implicit [[DEF156]], implicit [[DEF157]], implicit [[DEF158]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF159]], implicit [[DEF160]], implicit [[DEF161]], implicit [[DEF162]], implicit [[DEF163]], implicit [[DEF164]], implicit [[DEF165]], implicit [[DEF166]], implicit [[DEF167]], implicit [[DEF168]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF169]], implicit [[DEF170]], implicit [[DEF171]], implicit [[DEF172]], implicit [[DEF173]], implicit [[DEF174]], implicit [[DEF175]], implicit [[DEF176]], implicit [[DEF177]], implicit [[DEF178]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF179]], implicit [[DEF180]], implicit [[DEF181]], implicit [[DEF182]], implicit [[DEF183]], implicit [[DEF184]], implicit [[DEF185]], implicit [[DEF186]], implicit [[DEF187]], implicit [[DEF188]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF189]], implicit [[DEF190]], implicit [[DEF191]], implicit [[DEF192]], implicit [[DEF193]], implicit [[DEF194]], implicit [[DEF195]], implicit [[DEF196]], implicit [[DEF197]], implicit [[DEF198]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF199]], implicit [[DEF200]], implicit [[DEF201]], implicit [[DEF202]], implicit [[DEF203]], implicit [[DEF204]], implicit [[DEF205]], implicit [[DEF206]], implicit [[DEF207]], implicit [[DEF208]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF209]], implicit [[DEF210]], implicit [[DEF211]], implicit [[DEF212]], implicit [[DEF213]], implicit [[DEF214]], implicit [[DEF215]], implicit [[DEF216]], implicit [[DEF217]], implicit [[DEF218]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF219]], implicit [[DEF220]], implicit [[DEF221]], implicit [[DEF222]], implicit [[DEF223]], implicit [[DEF224]], implicit [[DEF225]], implicit [[DEF226]], implicit [[DEF227]], implicit [[DEF228]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF229]], implicit [[DEF230]], implicit [[DEF231]], implicit [[DEF232]], implicit [[DEF233]], implicit [[DEF234]], implicit [[DEF235]], implicit [[DEF236]], implicit [[DEF237]], implicit [[DEF238]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF239]], implicit [[DEF240]], implicit [[DEF241]], implicit [[DEF242]], implicit [[DEF243]], implicit [[DEF244]], implicit [[DEF245]], implicit [[DEF246]], implicit [[DEF247]], implicit [[DEF248]]
+ ; GFX90A-NEXT: S_NOP 0, implicit [[DEF249]], implicit [[DEF250]], implicit [[DEF251]], implicit [[DEF252]], implicit [[DEF253]], implicit [[DEF254]], implicit [[DEF255]], implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
; GFX90A-NEXT: S_ENDPGM 0
bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
index f69337e..06d8474 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
@@ -2104,13 +2104,9 @@ body: |
; GFX908-NEXT: [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 69
; GFX908-NEXT: [[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 70
; GFX908-NEXT: [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 71
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 72
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 73
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = S_MOV_B32 74
- ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
; GFX908-NEXT: [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 75
; GFX908-NEXT: [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 76
; GFX908-NEXT: [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 77
@@ -2120,7 +2116,11 @@ body: |
; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 81
; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 82
; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 83
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 84
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index 553d7e0..680942fcb 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -279,11 +279,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v58
; CHECK-NEXT: s_branch .LBB0_7
-; CHECK-NEXT: .LBB0_16: ; %Flow45
+; CHECK-NEXT: .LBB0_16: ; %Flow43
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s69
; CHECK-NEXT: v_mov_b32_e32 v57, v0
-; CHECK-NEXT: .LBB0_17: ; %Flow46
+; CHECK-NEXT: .LBB0_17: ; %Flow44
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68
; CHECK-NEXT: s_mov_b32 s55, exec_lo
@@ -330,11 +330,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v57
; CHECK-NEXT: s_branch .LBB0_19
-; CHECK-NEXT: .LBB0_22: ; %Flow43
+; CHECK-NEXT: .LBB0_22: ; %Flow41
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_inst_prefetch 0x2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68
-; CHECK-NEXT: .LBB0_23: ; %Flow44
+; CHECK-NEXT: .LBB0_23: ; %Flow42
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1
@@ -347,7 +347,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_or_b32 s53, s4, s53
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53
; CHECK-NEXT: s_cbranch_execnz .LBB0_5
-; CHECK-NEXT: .LBB0_25: ; %Flow51
+; CHECK-NEXT: .LBB0_25: ; %Flow49
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 1
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll
index 11cda2d..c96ba75 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll
@@ -199,7 +199,6 @@ define float @v_mad_mix_f32_bf16lo_bf16lo_negabsf32(bfloat %src0, bfloat %src1,
ret float %result
}
-
define float @v_mad_mix_f32_bf16lo_bf16lo_f32imm1(bfloat %src0, bfloat %src1) #0 {
; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32imm1:
; GFX1250: ; %bb.0:
@@ -230,7 +229,6 @@ define float @v_mad_mix_f32_bf16lo_bf16lo_f32imminv2pi(bfloat %src0, bfloat %src
ret float %result
}
-
define float @v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imminv2pi(bfloat %src0, bfloat %src1) #0 {
; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imminv2pi:
; GFX1250: ; %bb.0:
@@ -247,7 +245,6 @@ define float @v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imminv2pi(bfloat %src0, bfloat
ret float %result
}
-
define float @v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imm63(bfloat %src0, bfloat %src1) #0 {
; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imm63:
; GFX1250: ; %bb.0:
@@ -360,7 +357,6 @@ define float @no_mix_simple_fabs(float %src0, float %src1, float %src2) #0 {
ret float %result
}
-
define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_denormals(bfloat %src0, bfloat %src1, bfloat %src2) #1 {
; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_denormals:
; GFX1250: ; %bb.0:
@@ -469,7 +465,6 @@ define float @v_mad_mix_f32_negprecvtbf16lo_bf16lo_bf16lo(i32 %src0.arg, bfloat
ret float %result
}
-
define float @v_mad_mix_f32_precvtnegbf16hi_abs_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 {
; GFX1250-LABEL: v_mad_mix_f32_precvtnegbf16hi_abs_bf16lo_bf16lo:
; GFX1250: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
index 1b2eb83..03304ae 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
@@ -74,9 +74,7 @@ define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt(bfloat %src0, b
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext bfloat %src0 to float
%src1.ext = fpext bfloat %src1 to float
@@ -105,7 +103,6 @@ define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_pre_cvt(bfloat %src0, bf
ret bfloat %cvt.result
}
-
define <2 x bfloat> @v_mad_mix_v2f32(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
; GFX1250-LABEL: v_mad_mix_v2f32:
; GFX1250: ; %bb.0:
@@ -178,7 +175,6 @@ define <4 x bfloat> @v_mad_mix_v4f32(<4 x bfloat> %src0, <4 x bfloat> %src1, <4
ret <4 x bfloat> %cvt.result
}
-
define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
; GFX1250-LABEL: v_mad_mix_v2f32_clamp_postcvt:
; GFX1250: ; %bb.0:
@@ -191,9 +187,7 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt(<2 x bfloat> %src0, <2 x bflo
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 clamp
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
%src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
@@ -205,7 +199,6 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt(<2 x bfloat> %src0, <2 x bflo
ret <2 x bfloat> %clamp
}
-
define <3 x bfloat> @v_mad_mix_v3f32_clamp_postcvt(<3 x bfloat> %src0, <3 x bfloat> %src1, <3 x bfloat> %src2) #0 {
; GFX1250-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; GFX1250: ; %bb.0:
@@ -247,11 +240,8 @@ define <4 x bfloat> @v_mad_mix_v4f32_clamp_postcvt(<4 x bfloat> %src0, <4 x bflo
; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[6:7], v[0:1], v[2:3]
; GFX1250-NEXT: v_pk_fma_f32 v[2:3], v[8:9], v[10:11], v[12:13]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
-; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
-; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 clamp
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3 clamp
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.ext = fpext <4 x bfloat> %src0 to <4 x float>
%src1.ext = fpext <4 x bfloat> %src1 to <4 x float>
@@ -323,7 +313,6 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x bfloat> %src0, <2 x b
ret <2 x bfloat> %insert
}
-
define <2 x bfloat> @v_mad_mix_v2f32_clamp_precvt(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 {
; GFX1250-LABEL: v_mad_mix_v2f32_clamp_precvt:
; GFX1250: ; %bb.0:
@@ -351,7 +340,6 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_precvt(<2 x bfloat> %src0, <2 x bfloa
ret <2 x bfloat> %cvt.result
}
-
define <3 x bfloat> @v_mad_mix_v3f32_clamp_precvt(<3 x bfloat> %src0, <3 x bfloat> %src1, <3 x bfloat> %src2) #0 {
; GFX1250-LABEL: v_mad_mix_v3f32_clamp_precvt:
; GFX1250: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 811e255..eab9266 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -2382,13 +2382,22 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
}
define i32 @mixlo_zext(float %src0, float %src1, float %src2) #0 {
-; GFX1100-LABEL: mixlo_zext:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: mixlo_zext:
+; SDAG-GFX1100-TRUE16: ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: mixlo_zext:
+; SDAG-GFX1100-FAKE16: ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2
+; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: mixlo_zext:
; GFX900: ; %bb.0:
@@ -2418,6 +2427,14 @@ define i32 @mixlo_zext(float %src0, float %src1, float %src2) #0 {
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: mixlo_zext:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-CI-LABEL: mixlo_zext:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
index ef80323..fbf8011 100644
--- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
@@ -179,8 +179,7 @@ define i32 @v_mad_u16_zext(i16 %arg0, i16 %arg1, i16 %arg2) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_mad_u16_zext:
@@ -222,9 +221,9 @@ define i64 @v_mad_u16_zext64(i16 %arg0, i16 %arg1, i16 %arg2) {
; GFX11-TRUE16-LABEL: v_mad_u16_zext64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_mad_u16_zext64:
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index cf9a700..e6960a3 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -5,6 +5,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX1100 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 < %s | FileCheck -check-prefixes=GFX11,GFX1150 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=6 < %s | FileCheck -check-prefixes=GFX11,GFX1100 %s
; On GFX11, ensure vdst and src2 do not partially overlap. Full overlap is ok.
@@ -54,6 +55,13 @@ define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: mad_i64_i32_sextops:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mad_nc_i64_i32 v[0:1], v0, v1, v[2:3]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%sext0 = sext i32 %arg0 to i64
%sext1 = sext i32 %arg1 to i64
%mul = mul i64 %sext0, %sext1
@@ -106,6 +114,13 @@ define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: mad_i64_i32_sextops_commute:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mad_nc_i64_i32 v[0:1], v0, v1, v[2:3]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%sext0 = sext i32 %arg0 to i64
%sext1 = sext i32 %arg1 to i64
%mul = mul i64 %sext0, %sext1
@@ -158,6 +173,13 @@ define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[2:3]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: mad_u64_u32_zextops:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v0, v1, v[2:3]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%sext0 = zext i32 %arg0 to i64
%sext1 = zext i32 %arg1 to i64
%mul = mul i64 %sext0, %sext1
@@ -210,6 +232,13 @@ define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[2:3]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: mad_u64_u32_zextops_commute:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v0, v1, v[2:3]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%sext0 = zext i32 %arg0 to i64
%sext1 = zext i32 %arg1 to i64
%mul = mul i64 %sext0, %sext1
@@ -393,6 +422,38 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: mad_i64_i32_sextops_i32_i128:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v9, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v1, v9
+; GFX1250-NEXT: v_mov_b32_e32 v21, v9
+; GFX1250-NEXT: v_mul_u64_e32 v[10:11], v[0:1], v[8:9]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_ashrrev_i32 v12, 31, v0 :: v_dual_mov_b32 v8, v11
+; GFX1250-NEXT: v_dual_ashrrev_i32 v7, 31, v6 :: v_dual_mov_b32 v13, v12
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[14:15], v12, v6, v[8:9]
+; GFX1250-NEXT: v_mul_u64_e32 v[16:17], v[6:7], v[12:13]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_e32 v8, v14
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v0, v7, v[8:9]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v8, v15 :: v_dual_mov_b32 v20, v19
+; GFX1250-NEXT: v_add_nc_u64_e32 v[8:9], v[8:9], v[20:21]
+; GFX1250-NEXT: v_mad_nc_i64_i32 v[0:1], v7, v0, v[16:17]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[8:9], v12, v7, v[8:9]
+; GFX1250-NEXT: v_add_nc_u64_e32 v[6:7], v[8:9], v[0:1]
+; GFX1250-NEXT: v_add_co_u32 v0, vcc_lo, v10, v2
+; GFX1250-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v18, v3, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v6, v4, vcc_lo
+; GFX1250-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v7, v5, vcc_lo
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%sext0 = sext i32 %arg0 to i128
%sext1 = sext i32 %arg1 to i128
%mul = mul i128 %sext0, %sext1
@@ -445,6 +506,13 @@ define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: mad_i64_i32_sextops_i32_i63:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mad_nc_i64_i32 v[0:1], v0, v1, v[2:3]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%sext0 = sext i32 %arg0 to i63
%sext1 = sext i32 %arg1 to i63
%mul = mul i63 %sext0, %sext1
@@ -510,6 +578,16 @@ define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: mad_i64_i32_sextops_i31_i63:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 31
+; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 31
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_nc_i64_i32 v[0:1], v0, v1, v[2:3]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%sext0 = sext i31 %arg0 to i63
%sext1 = sext i31 %arg1 to i63
%mul = mul i63 %sext0, %sext1
@@ -585,6 +663,17 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: v_ashrrev_i32_e32 v2, 31, v5
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v2, v4, v[1:2]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: mad_i64_i32_extops_i32_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v4, v[2:3]
+; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v5
+; GFX1250-NEXT: v_mad_u32 v1, v2, v4, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ext0 = sext i32 %arg0 to i64
%ext1 = zext i32 %arg1 to i64
%mul = mul i64 %ext0, %ext1
@@ -637,6 +726,13 @@ define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v2, v[4:5]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: mad_u64_u32_bitops:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v0, v2, v[4:5]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%trunc.lhs = and i64 %arg0, 4294967295
%trunc.rhs = and i64 %arg1, 4294967295
%mul = mul i64 %trunc.lhs, %trunc.rhs
@@ -711,6 +807,17 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
; GFX12-NEXT: v_and_b32_e32 v3, 1, v3
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v3, v2, v[1:2]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: mad_u64_u32_bitops_lhs_mask_small:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v0, v2, v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX1250-NEXT: v_mad_u32 v1, v3, v2, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%trunc.lhs = and i64 %arg0, 8589934591
%trunc.rhs = and i64 %arg1, 4294967295
%mul = mul i64 %trunc.lhs, %trunc.rhs
@@ -786,6 +893,17 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
; GFX12-NEXT: v_and_b32_e32 v2, 1, v3
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v2, v[1:2]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: mad_u64_u32_bitops_rhs_mask_small:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v6, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v6, v2, v[4:5]
+; GFX1250-NEXT: v_and_b32_e32 v2, 1, v3
+; GFX1250-NEXT: v_mad_u32 v1, v6, v2, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%trunc.lhs = and i64 %arg0, 4294967295
%trunc.rhs = and i64 %arg1, 8589934591
%mul = mul i64 %trunc.lhs, %trunc.rhs
@@ -838,6 +956,13 @@ define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v2, v[4:5]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: mad_i64_i32_bitops:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mad_nc_i64_i32 v[0:1], v0, v2, v[4:5]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%shl.lhs = shl i64 %arg0, 32
%trunc.lhs = ashr i64 %shl.lhs, 32
%shl.rhs = shl i64 %arg1, 32
@@ -893,6 +1018,13 @@ define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v0, v[0:1]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: mad_i64_i32_unpack_i64ops:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v1, v0, v[0:1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%tmp4 = lshr i64 %arg0, 32
%tmp5 = and i64 %arg0, 4294967295
%mul = mul nuw i64 %tmp4, %tmp5
@@ -940,14 +1072,14 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mul_hi_u32 s4, s2, s3
; GFX9-NEXT: s_mul_i32 s2, s2, s3
; GFX9-NEXT: s_add_u32 s2, s2, s6
; GFX9-NEXT: s_addc_u32 s3, s4, s7
-; GFX9-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX9-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: mad_i64_i32_uniform:
@@ -982,6 +1114,25 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: mad_i64_i32_uniform:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1250-NEXT: s_mov_b32 s7, 0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s6, s2
+; GFX1250-NEXT: s_mov_b32 s2, s3
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_mul_u64 s[2:3], s[6:7], s[2:3]
+; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ext0 = zext i32 %arg0 to i64
%ext1 = zext i32 %arg1 to i64
%mul = mul i64 %ext0, %ext1
@@ -1055,6 +1206,17 @@ define i64 @mad_i64_i32_twice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3) #0 {
; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0
; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: mad_i64_i32_twice:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], v0, v1, v[2:3]
+; GFX1250-NEXT: v_mad_nc_i64_i32 v[0:1], v0, v1, v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_xor_b32_e32 v0, v2, v0
+; GFX1250-NEXT: v_xor_b32_e32 v1, v3, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%sext0 = sext i32 %arg0 to i64
%sext1 = sext i32 %arg1 to i64
%mul = mul i64 %sext0, %sext1
@@ -1174,6 +1336,26 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %
; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0
; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: mad_i64_i32_thrice:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_ashrrev_i32 v1, 31, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_ashrrev_i32_e32 v9, 31, v8
+; GFX1250-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[8:9]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3]
+; GFX1250-NEXT: v_add_nc_u64_e32 v[4:5], v[0:1], v[4:5]
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[6:7]
+; GFX1250-NEXT: v_xor_b32_e32 v2, v2, v4
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_xor_b32_e32 v3, v3, v5
+; GFX1250-NEXT: v_xor_b32_e32 v0, v2, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_xor_b32_e32 v1, v3, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%sext0 = sext i32 %arg0 to i64
%sext1 = sext i32 %arg1 to i64
%mul = mul i64 %sext0, %sext1
@@ -1256,6 +1438,21 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0
; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: mad_i64_i32_secondary_use:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_ashrrev_i32 v1, 31, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_ashrrev_i32_e32 v5, 31, v4
+; GFX1250-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3]
+; GFX1250-NEXT: v_xor_b32_e32 v0, v2, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_xor_b32_e32 v1, v3, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%sext0 = sext i32 %arg0 to i64
%sext1 = sext i32 %arg1 to i64
%mul = mul i64 %sext0, %sext1
@@ -1328,6 +1525,18 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v1, v2, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: mad_i48_i48:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v6, v2, v[4:5]
+; GFX1250-NEXT: v_mad_u32 v1, v7, v2, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_u32 v1, v6, v3, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%m = mul i48 %arg0, %arg1
%a = add i48 %m, %arg2
ret i48 %a
@@ -1391,6 +1600,15 @@ define i64 @lshr_mad_i64_1(i64 %arg0, i64 %arg1) #0 {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xfffffc19, v2, v[0:1]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: lshr_mad_i64_1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], 0xfffffc19, v2, v[0:1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%lsh = lshr i64 %arg0, 32
%mul = mul i64 %lsh, s0xfffffffffffffc19
%mad = add i64 %mul, %arg0
@@ -1456,6 +1674,15 @@ define i64 @lshr_mad_i64_2(i64 %arg0) #0 {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xd1, v2, v[0:1]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: lshr_mad_i64_2:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], 0xd1, v2, v[0:1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%lsh = lshr i64 %arg0, 32
%mul = mul i64 %lsh, s0xffffffff000000d1
%mad = add i64 %mul, %arg0
@@ -1521,6 +1748,15 @@ define i64 @lshr_mad_i64_3(i64 %arg0) #0 {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v2, v[0:1]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: lshr_mad_i64_3:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], 0xfffffc88, v2, v[0:1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%lsh = lshr i64 %arg0, 32
%mul = mul i64 s0xfffffffffffffc88, %lsh
%mad = add i64 %mul, %arg0
@@ -1602,6 +1838,19 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v0, v[3:4]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: lshr_mad_i64_4:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mul_u64_e32 v[2:3], v[2:3], v[0:1]
+; GFX1250-NEXT: v_mov_b32_e32 v0, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], 0xfffffc88, v3, v[0:1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%ext = zext i32 %arg0 to i64
%mul1 = mul i64 %arg1, %ext
%lsh = lshr i64 %mul1, 32
@@ -1666,6 +1915,15 @@ define i64 @lshr_mad_i64_negative_1(i64 %arg0) #0 {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, 0xfffffc19, v2, v[0:1]
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: lshr_mad_i64_negative_1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_lshrrev_b32_e32 v2, 4, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_nc_i64_i32 v[0:1], 0xfffffc19, v2, v[0:1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%lsh = lshr i64 %arg0, 36
%mul = mul i64 %lsh, s0xfffffffffffffc19
%mad = add i64 %mul, %arg0
@@ -1729,6 +1987,16 @@ define i64 @lshr_mad_i64_negative_2(i64 %arg0) #0 {
; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: lshr_mad_i64_negative_2:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0xd1, v1, v[0:1]
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 8, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_sub_nc_u32 v1, v3, v0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%lsh = lshr i64 %arg0, 32
%mul = mul i64 %lsh, s0xffffff00000000d1
%mad = add i64 %mul, %arg0
@@ -1803,6 +2071,18 @@ define i64 @lshr_mad_i64_negative_3(i64 %arg0) #0 {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: lshr_mad_i64_negative_3:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_lshrrev_b64 v[2:3], 22, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2
+; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 1, v[0:1]
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = add i64 %arg0, 1
%lsh = lshr i64 %arg0, 32
%mul = mul i64 %lsh, s0xfffffffffffffc00
@@ -1878,6 +2158,16 @@ define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: lshr_mad_i64_negative_4:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], v1, v0, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_u32 v1, v1, v1, v3
+; GFX1250-NEXT: v_mov_b32_e32 v0, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%lsh = lshr i64 %arg0, 32
%mul = mul i64 %lsh, %arg0
%mad = add i64 %mul, %arg0
@@ -1938,6 +2228,16 @@ define amdgpu_ps i64 @lshr_mad_i64_sgpr(i64 inreg %arg0) #0 {
; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: lshr_mad_i64_sgpr:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: s_mov_b32 s2, s1
+; GFX1250-NEXT: s_mov_b64 s[4:5], lit64(0xffffffffffff1c18)
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_mul_u64 s[2:3], s[2:3], s[4:5]
+; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX1250-NEXT: ; return to shader part epilog
%lsh = lshr i64 %arg0, 32
%mul = mul i64 %lsh, s0xffffffffffff1c18
%mad = add i64 %mul, %arg0
@@ -2018,6 +2318,17 @@ define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 {
; GFX12-NEXT: v_sub_nc_u32_e32 v3, v7, v3
; GFX12-NEXT: v_mov_b32_e32 v2, v6
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: lshr_mad_i64_vec:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], 0xffff1c18, v1, v[0:1]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[6:7], 0xffff1118, v3, v[2:3]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_dual_sub_nc_u32 v1, v5, v1 :: v_dual_sub_nc_u32 v3, v7, v3
+; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v2, v6
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%lsh = lshr <2 x i64> %arg0, <i64 32, i64 32>
%mul = mul <2 x i64> %lsh, <i64 s0xffffffffffff1c18, i64 s0xffffffffffff1118>
%mad = add <2 x i64> %mul, %arg0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
new file mode 100644
index 0000000..e921f58
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
@@ -0,0 +1,183 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
+
+define amdgpu_kernel void @test_s_barrier() {
+; GFX10-WGP-LABEL: test_s_barrier:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_barrier
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: test_s_barrier:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-CU-NEXT: s_barrier
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: test_s_barrier:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_barrier
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: test_s_barrier:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
+; GFX11-CU-NEXT: s_barrier
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: test_s_barrier:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_barrier_signal -1
+; GFX12-WGP-NEXT: s_barrier_wait -1
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: test_s_barrier:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_alu 0xffe3
+; GFX12-CU-NEXT: s_barrier_signal -1
+; GFX12-CU-NEXT: s_barrier_wait -1
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: test_s_barrier:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_alu 0xffe3
+; GFX1250-NEXT: s_barrier_signal -1
+; GFX1250-NEXT: s_barrier_wait -1
+; GFX1250-NEXT: s_endpgm
+entry:
+ call void @llvm.amdgcn.s.barrier()
+ ret void
+}
+
+define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
+; GFX10-WGP-LABEL: test_s_barrier_workgroup_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT: s_barrier
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: test_s_barrier_workgroup_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-CU-NEXT: s_barrier
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: test_s_barrier_workgroup_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: s_barrier
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: test_s_barrier_workgroup_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
+; GFX11-CU-NEXT: s_barrier
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: test_s_barrier_workgroup_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: s_barrier_signal -1
+; GFX12-WGP-NEXT: s_barrier_wait -1
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: test_s_barrier_workgroup_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_alu 0xffe3
+; GFX12-CU-NEXT: s_barrier_signal -1
+; GFX12-CU-NEXT: s_barrier_wait -1
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: test_s_barrier_workgroup_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_alu 0xffe3
+; GFX1250-NEXT: s_barrier_signal -1
+; GFX1250-NEXT: s_barrier_wait -1
+; GFX1250-NEXT: s_endpgm
+entry:
+ fence syncscope("workgroup") release
+ call void @llvm.amdgcn.s.barrier()
+ ret void
+}
+
+define amdgpu_kernel void @test_s_barrier_agent_fence() {
+; GFX10-WGP-LABEL: test_s_barrier_agent_fence:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-WGP-NEXT: s_barrier
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: test_s_barrier_agent_fence:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-CU-NEXT: s_barrier
+; GFX10-CU-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: test_s_barrier_agent_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: s_barrier
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: test_s_barrier_agent_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3
+; GFX11-CU-NEXT: s_barrier
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: test_s_barrier_agent_fence:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
+; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
+; GFX12-WGP-NEXT: s_wait_storecnt 0x0
+; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-WGP-NEXT: s_barrier_signal -1
+; GFX12-WGP-NEXT: s_barrier_wait -1
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: test_s_barrier_agent_fence:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_alu 0xffe3
+; GFX12-CU-NEXT: s_barrier_signal -1
+; GFX12-CU-NEXT: s_barrier_wait -1
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: test_s_barrier_agent_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_alu 0xffe3
+; GFX1250-NEXT: s_barrier_signal -1
+; GFX1250-NEXT: s_barrier_wait -1
+; GFX1250-NEXT: s_endpgm
+entry:
+ fence syncscope("agent") release
+ call void @llvm.amdgcn.s.barrier()
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
index 80445f7..97d52d5 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
@@ -12,6 +12,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX6-LABEL: workgroup_acquire_fence:
@@ -78,6 +79,10 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX12-CU-LABEL: workgroup_acquire_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -145,6 +150,10 @@ define amdgpu_kernel void @workgroup_release_fence() {
; GFX12-CU-LABEL: workgroup_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -217,6 +226,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
; GFX12-CU-LABEL: workgroup_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -289,6 +302,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
; GFX12-CU-LABEL: workgroup_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -359,6 +376,10 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
; GFX12-CU-LABEL: workgroup_one_as_acquire_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_one_as_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -426,6 +447,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
; GFX12-CU-LABEL: workgroup_one_as_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -498,6 +523,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -570,6 +599,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -662,6 +695,13 @@ define amdgpu_kernel void @agent_acquire_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -744,6 +784,14 @@ define amdgpu_kernel void @agent_release_fence() {
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -842,6 +890,15 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -940,6 +997,15 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1032,6 +1098,13 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_one_as_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1114,6 +1187,14 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1212,6 +1293,15 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1310,6 +1400,15 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1404,6 +1503,13 @@ define amdgpu_kernel void @system_acquire_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1490,6 +1596,15 @@ define amdgpu_kernel void @system_release_fence() {
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1594,6 +1709,16 @@ define amdgpu_kernel void @system_acq_rel_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1698,6 +1823,16 @@ define amdgpu_kernel void @system_seq_cst_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1792,6 +1927,13 @@ define amdgpu_kernel void @system_one_as_acquire_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_one_as_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1878,6 +2020,15 @@ define amdgpu_kernel void @system_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1982,6 +2133,16 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -2086,6 +2247,16 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll
index 7a419a5..cc42428 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll
@@ -12,6 +12,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX6-LABEL: workgroup_acquire_fence:
@@ -76,6 +77,11 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -142,6 +148,10 @@ define amdgpu_kernel void @workgroup_release_fence() {
; GFX12-CU-LABEL: workgroup_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -208,6 +218,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
; GFX12-CU-LABEL: workgroup_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -274,6 +288,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
; GFX12-CU-LABEL: workgroup_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -331,6 +349,10 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
; GFX12-CU-LABEL: workgroup_one_as_acquire_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_one_as_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -388,6 +410,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
; GFX12-CU-LABEL: workgroup_one_as_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -445,6 +471,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -502,6 +532,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -570,6 +604,11 @@ define amdgpu_kernel void @agent_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -636,6 +675,10 @@ define amdgpu_kernel void @agent_release_fence() {
; GFX12-CU-LABEL: agent_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -702,6 +745,10 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
; GFX12-CU-LABEL: agent_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -768,6 +815,10 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
; GFX12-CU-LABEL: agent_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -825,6 +876,10 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() {
; GFX12-CU-LABEL: agent_one_as_acquire_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_one_as_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -882,6 +937,10 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
; GFX12-CU-LABEL: agent_one_as_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -939,6 +998,10 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
; GFX12-CU-LABEL: agent_one_as_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -996,6 +1059,10 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
; GFX12-CU-LABEL: agent_one_as_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -1064,6 +1131,11 @@ define amdgpu_kernel void @system_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -1130,6 +1202,10 @@ define amdgpu_kernel void @system_release_fence() {
; GFX12-CU-LABEL: system_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence release, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -1196,6 +1272,10 @@ define amdgpu_kernel void @system_acq_rel_fence() {
; GFX12-CU-LABEL: system_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -1262,6 +1342,10 @@ define amdgpu_kernel void @system_seq_cst_fence() {
; GFX12-CU-LABEL: system_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -1319,6 +1403,10 @@ define amdgpu_kernel void @system_one_as_acquire_fence() {
; GFX12-CU-LABEL: system_one_as_acquire_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_one_as_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -1376,6 +1464,10 @@ define amdgpu_kernel void @system_one_as_release_fence() {
; GFX12-CU-LABEL: system_one_as_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -1433,6 +1525,10 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
; GFX12-CU-LABEL: system_one_as_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
@@ -1490,6 +1586,10 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
; GFX12-CU-LABEL: system_one_as_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
index 0e459ed..b3f6533 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
@@ -12,6 +12,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @singlethread_acquire_fence() {
; GFX6-LABEL: singlethread_acquire_fence:
@@ -65,6 +66,10 @@ define amdgpu_kernel void @singlethread_acquire_fence() {
; GFX12-CU-LABEL: singlethread_acquire_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: singlethread_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("singlethread") acquire
ret void
@@ -122,6 +127,10 @@ define amdgpu_kernel void @singlethread_release_fence() {
; GFX12-CU-LABEL: singlethread_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: singlethread_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("singlethread") release
ret void
@@ -179,6 +188,10 @@ define amdgpu_kernel void @singlethread_acq_rel_fence() {
; GFX12-CU-LABEL: singlethread_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: singlethread_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("singlethread") acq_rel
ret void
@@ -236,6 +249,10 @@ define amdgpu_kernel void @singlethread_seq_cst_fence() {
; GFX12-CU-LABEL: singlethread_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: singlethread_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("singlethread") seq_cst
ret void
@@ -293,6 +310,10 @@ define amdgpu_kernel void @singlethread_one_as_acquire_fence() {
; GFX12-CU-LABEL: singlethread_one_as_acquire_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: singlethread_one_as_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("singlethread-one-as") acquire
ret void
@@ -350,6 +371,10 @@ define amdgpu_kernel void @singlethread_one_as_release_fence() {
; GFX12-CU-LABEL: singlethread_one_as_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: singlethread_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("singlethread-one-as") release
ret void
@@ -407,6 +432,10 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() {
; GFX12-CU-LABEL: singlethread_one_as_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: singlethread_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("singlethread-one-as") acq_rel
ret void
@@ -464,6 +493,10 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() {
; GFX12-CU-LABEL: singlethread_one_as_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: singlethread_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("singlethread-one-as") seq_cst
ret void
@@ -521,6 +554,10 @@ define amdgpu_kernel void @wavefront_acquire_fence() {
; GFX12-CU-LABEL: wavefront_acquire_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: wavefront_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("wavefront") acquire
ret void
@@ -578,6 +615,10 @@ define amdgpu_kernel void @wavefront_release_fence() {
; GFX12-CU-LABEL: wavefront_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: wavefront_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("wavefront") release
ret void
@@ -635,6 +676,10 @@ define amdgpu_kernel void @wavefront_acq_rel_fence() {
; GFX12-CU-LABEL: wavefront_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: wavefront_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("wavefront") acq_rel
ret void
@@ -692,6 +737,10 @@ define amdgpu_kernel void @wavefront_seq_cst_fence() {
; GFX12-CU-LABEL: wavefront_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: wavefront_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("wavefront") seq_cst
ret void
@@ -749,6 +798,10 @@ define amdgpu_kernel void @wavefront_one_as_acquire_fence() {
; GFX12-CU-LABEL: wavefront_one_as_acquire_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: wavefront_one_as_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("wavefront-one-as") acquire
ret void
@@ -806,6 +859,10 @@ define amdgpu_kernel void @wavefront_one_as_release_fence() {
; GFX12-CU-LABEL: wavefront_one_as_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: wavefront_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("wavefront-one-as") release
ret void
@@ -863,6 +920,10 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() {
; GFX12-CU-LABEL: wavefront_one_as_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: wavefront_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("wavefront-one-as") acq_rel
ret void
@@ -920,6 +981,10 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() {
; GFX12-CU-LABEL: wavefront_one_as_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: wavefront_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("wavefront-one-as") seq_cst
ret void
@@ -998,6 +1063,11 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acquire
ret void
@@ -1073,6 +1143,11 @@ define amdgpu_kernel void @workgroup_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") release
ret void
@@ -1153,6 +1228,11 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acq_rel
ret void
@@ -1233,6 +1313,11 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") seq_cst
ret void
@@ -1303,6 +1388,10 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
; GFX12-CU-LABEL: workgroup_one_as_acquire_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_one_as_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acquire
ret void
@@ -1370,6 +1459,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
; GFX12-CU-LABEL: workgroup_one_as_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") release
ret void
@@ -1442,6 +1535,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acq_rel
ret void
@@ -1514,6 +1611,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") seq_cst
ret void
@@ -1606,6 +1707,13 @@ define amdgpu_kernel void @agent_acquire_fence() {
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") acquire
ret void
@@ -1688,6 +1796,14 @@ define amdgpu_kernel void @agent_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") release
ret void
@@ -1786,6 +1902,15 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") acq_rel
ret void
@@ -1884,6 +2009,15 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") seq_cst
ret void
@@ -1976,6 +2110,13 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_one_as_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acquire
ret void
@@ -2058,6 +2199,14 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") release
ret void
@@ -2156,6 +2305,15 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acq_rel
ret void
@@ -2254,6 +2412,15 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: agent_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") seq_cst
ret void
@@ -2348,6 +2515,13 @@ define amdgpu_kernel void @system_acquire_fence() {
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence acquire
ret void
@@ -2434,6 +2608,15 @@ define amdgpu_kernel void @system_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence release
ret void
@@ -2538,6 +2721,16 @@ define amdgpu_kernel void @system_acq_rel_fence() {
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence acq_rel
ret void
@@ -2642,6 +2835,16 @@ define amdgpu_kernel void @system_seq_cst_fence() {
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence seq_cst
ret void
@@ -2736,6 +2939,13 @@ define amdgpu_kernel void @system_one_as_acquire_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_one_as_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") acquire
ret void
@@ -2822,6 +3032,15 @@ define amdgpu_kernel void @system_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") release
ret void
@@ -2926,6 +3145,16 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") acq_rel
ret void
@@ -3030,6 +3259,16 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: system_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") seq_cst
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index 07ad8cb..36adbc0 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -11,6 +11,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @flat_agent_unordered_load(
; GFX7-LABEL: flat_agent_unordered_load:
@@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_agent_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent") unordered, align 4
@@ -369,6 +381,17 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent") monotonic, align 4
@@ -566,6 +589,18 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent") acquire, align 4
@@ -789,6 +824,24 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent") seq_cst, align 4
@@ -939,6 +992,16 @@ define amdgpu_kernel void @flat_agent_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") unordered, align 4
@@ -1088,6 +1151,16 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") monotonic, align 4
@@ -1261,6 +1334,20 @@ define amdgpu_kernel void @flat_agent_release_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") release, align 4
@@ -1434,6 +1521,20 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") seq_cst, align 4
@@ -1583,6 +1684,16 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") monotonic
@@ -1763,6 +1874,18 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire
@@ -1936,6 +2059,20 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") release
@@ -2140,6 +2277,22 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel
@@ -2344,6 +2497,22 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
@@ -2552,6 +2721,19 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire
@@ -2789,6 +2971,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel
@@ -3026,6 +3227,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
@@ -3264,6 +3484,20 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3533,6 +3767,22 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3795,6 +4045,24 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4088,6 +4356,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4381,6 +4669,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4650,6 +4958,22 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4919,6 +5243,22 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5212,6 +5552,26 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5505,6 +5865,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5798,6 +6178,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6091,6 +6491,26 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6384,6 +6804,26 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6677,6 +7117,26 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6970,6 +7430,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7263,6 +7743,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7545,6 +8045,22 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7844,6 +8360,23 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8152,6 +8685,26 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8479,6 +9032,29 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8806,6 +9382,29 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9109,6 +9708,25 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9408,6 +10026,23 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9735,6 +10370,29 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10062,6 +10720,29 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10389,6 +11070,29 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10716,6 +11420,29 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11039,6 +11766,27 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11366,6 +12114,29 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11693,6 +12464,29 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12020,6 +12814,29 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12204,6 +13021,17 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent-one-as") unordered, align 4
@@ -12386,6 +13214,17 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent-one-as") monotonic, align 4
@@ -12593,6 +13432,19 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent-one-as") acquire, align 4
@@ -12826,6 +13678,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent-one-as") seq_cst, align 4
@@ -12976,6 +13847,16 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") unordered, align 4
@@ -13125,6 +14006,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") monotonic, align 4
@@ -13298,6 +14189,20 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") release, align 4
@@ -13471,6 +14376,20 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") seq_cst, align 4
@@ -13620,6 +14539,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") monotonic
@@ -13796,6 +14725,18 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire
@@ -13969,6 +14910,20 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") release
@@ -14169,6 +15124,22 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -14369,6 +15340,22 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -14587,6 +15574,20 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire
@@ -14834,6 +15835,26 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -15081,6 +16102,26 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -15319,6 +16360,20 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15584,6 +16639,22 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15846,6 +16917,24 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16135,6 +17224,26 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16424,6 +17533,26 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16689,6 +17818,22 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16954,6 +18099,22 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17243,6 +18404,26 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17532,6 +18713,26 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17821,6 +19022,26 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18110,6 +19331,26 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18399,6 +19640,26 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18688,6 +19949,26 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18977,6 +20258,26 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19266,6 +20567,26 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19548,6 +20869,22 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19857,6 +21194,24 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20165,6 +21520,26 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20502,6 +21877,30 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20839,6 +22238,30 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21152,6 +22575,26 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21461,6 +22904,24 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21798,6 +23259,30 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22135,6 +23620,30 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22472,6 +23981,30 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22809,6 +24342,30 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23142,6 +24699,28 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23479,6 +25058,30 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23816,6 +25419,30 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24153,6 +25780,30 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
index a00af8e..5526b29 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefix=GFX1250 %s
define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) {
; GFX12-LABEL: flat_last_use_load_0:
@@ -16,6 +17,17 @@ define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) {
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_last_use_load_0:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_LU
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
entry:
%val = load i32, ptr %in, align 4, !amdgpu.last.use !{}
store i32 %val, ptr %out
@@ -34,7 +46,6 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) {
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-NEXT: s_mov_b32 s2, 0
-; GFX12-NEXT: ; implicit-def: $sgpr2
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-NEXT: v_mov_b32_e32 v2, v0
@@ -55,6 +66,21 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) {
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_last_use_load_1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, ptr %in, i32 %tid
@@ -80,6 +106,19 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_last_use_and_volatile_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
entry:
%val = load volatile i32, ptr %in, align 4, !amdgpu.last.use !{}
store i32 %val, ptr %out
@@ -100,6 +139,17 @@ define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_last_use_and_nontemporal_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_LU
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
entry:
%val = load i32, ptr %in, align 4, !amdgpu.last.use !{}, !nontemporal !0
store i32 %val, ptr %out
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index 3c24c36..964f1c8 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -11,6 +11,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX7-LABEL: flat_nontemporal_load_0:
@@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_nontemporal_load_0:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load i32, ptr %in, align 4, !nontemporal !0
@@ -206,7 +218,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX7-NEXT: s_mov_b32 s6, 2
; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: ; implicit-def: $sgpr6
; GFX7-NEXT: v_mov_b32_e32 v0, 0
; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v2, v0
@@ -240,7 +251,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX10-WGP-NEXT: s_mov_b32 s6, 2
; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v1, s6, v0
; GFX10-WGP-NEXT: s_mov_b32 s6, 0
-; GFX10-WGP-NEXT: ; implicit-def: $sgpr6
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v0
@@ -273,7 +283,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX10-CU-NEXT: s_mov_b32 s6, 2
; GFX10-CU-NEXT: v_lshlrev_b32_e64 v1, s6, v0
; GFX10-CU-NEXT: s_mov_b32 s6, 0
-; GFX10-CU-NEXT: ; implicit-def: $sgpr6
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v0
@@ -301,7 +310,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 0
-; SKIP-CACHE-INV-NEXT: ; implicit-def: $sgpr2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0
@@ -335,7 +343,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 2
; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v2, s6, v0
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0
-; GFX90A-NOTTGSPLIT-NEXT: ; implicit-def: $sgpr6
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
@@ -368,7 +375,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 2
; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e64 v2, s6, v0
; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0
-; GFX90A-TGSPLIT-NEXT: ; implicit-def: $sgpr6
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
@@ -397,7 +403,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s4, 2
; GFX942-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s4, v0
; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s4, 0
-; GFX942-NOTTGSPLIT-NEXT: ; implicit-def: $sgpr4
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v2
@@ -418,7 +423,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX942-TGSPLIT-NEXT: s_mov_b32 s4, 2
; GFX942-TGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s4, v0
; GFX942-TGSPLIT-NEXT: s_mov_b32 s4, 0
-; GFX942-TGSPLIT-NEXT: ; implicit-def: $sgpr4
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v2
@@ -440,7 +444,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-WGP-NEXT: s_mov_b32 s2, 2
; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX11-WGP-NEXT: s_mov_b32 s2, 0
-; GFX11-WGP-NEXT: ; implicit-def: $sgpr2
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v0
@@ -470,7 +473,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-CU-NEXT: s_mov_b32 s2, 2
; GFX11-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX11-CU-NEXT: s_mov_b32 s2, 0
-; GFX11-CU-NEXT: ; implicit-def: $sgpr2
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v0
@@ -501,7 +503,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-WGP-NEXT: s_mov_b32 s2, 0
-; GFX12-WGP-NEXT: ; implicit-def: $sgpr2
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0
@@ -534,7 +535,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-CU-NEXT: s_mov_b32 s2, 0
-; GFX12-CU-NEXT: ; implicit-def: $sgpr2
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0
@@ -555,6 +555,21 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_nontemporal_load_1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_NT
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -739,6 +754,17 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 th:TH_STORE_NT
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_nontemporal_store_0:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] th:TH_STORE_NT scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load i32, ptr %in, align 4
@@ -761,7 +787,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX7-NEXT: s_mov_b32 s4, 2
; GFX7-NEXT: v_lshlrev_b32_e64 v3, s4, v0
; GFX7-NEXT: s_mov_b32 s4, 0
-; GFX7-NEXT: ; implicit-def: $sgpr4
; GFX7-NEXT: v_mov_b32_e32 v0, 0
; GFX7-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v4, v0
@@ -793,7 +818,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX10-WGP-NEXT: s_mov_b32 s4, 2
; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v3, s4, v0
; GFX10-WGP-NEXT: s_mov_b32 s4, 0
-; GFX10-WGP-NEXT: ; implicit-def: $sgpr4
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v4, v0
@@ -824,7 +848,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX10-CU-NEXT: s_mov_b32 s4, 2
; GFX10-CU-NEXT: v_lshlrev_b32_e64 v3, s4, v0
; GFX10-CU-NEXT: s_mov_b32 s4, 0
-; GFX10-CU-NEXT: ; implicit-def: $sgpr4
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v4, v0
@@ -851,7 +874,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, 2
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, 0
-; SKIP-CACHE-INV-NEXT: ; implicit-def: $sgpr0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v4, v0
@@ -882,7 +904,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s4, 2
; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v4, s4, v0
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s4, 0
-; GFX90A-NOTTGSPLIT-NEXT: ; implicit-def: $sgpr4
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v5, v0
@@ -913,7 +934,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX90A-TGSPLIT-NEXT: s_mov_b32 s4, 2
; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e64 v4, s4, v0
; GFX90A-TGSPLIT-NEXT: s_mov_b32 s4, 0
-; GFX90A-TGSPLIT-NEXT: ; implicit-def: $sgpr4
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v5, v0
@@ -942,7 +962,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 2
; GFX942-NOTTGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s2, v0
; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0
-; GFX942-NOTTGSPLIT-NEXT: ; implicit-def: $sgpr2
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v3
@@ -963,7 +982,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 2
; GFX942-TGSPLIT-NEXT: v_lshlrev_b32_e64 v0, s2, v0
; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0
-; GFX942-TGSPLIT-NEXT: ; implicit-def: $sgpr2
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, 0
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v3
@@ -985,7 +1003,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-WGP-NEXT: s_mov_b32 s0, 2
; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; GFX11-WGP-NEXT: s_mov_b32 s0, 0
-; GFX11-WGP-NEXT: ; implicit-def: $sgpr0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v4, v0
@@ -1014,7 +1031,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-CU-NEXT: s_mov_b32 s0, 2
; GFX11-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; GFX11-CU-NEXT: s_mov_b32 s0, 0
-; GFX11-CU-NEXT: ; implicit-def: $sgpr0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v4, v0
@@ -1045,7 +1061,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; GFX12-WGP-NEXT: s_mov_b32 s0, 0
-; GFX12-WGP-NEXT: ; implicit-def: $sgpr0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0
@@ -1078,7 +1093,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; GFX12-CU-NEXT: s_mov_b32 s0, 0
-; GFX12-CU-NEXT: ; implicit-def: $sgpr0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0
@@ -1095,6 +1109,20 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 th:TH_STORE_NT
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_nontemporal_store_1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3]
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s2, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset th:TH_STORE_NT scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1293,6 +1321,19 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_nontemporal_volatile_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load volatile i32, ptr %in, align 4, !nontemporal !0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index b88a10ab..871c941 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -11,6 +11,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @flat_singlethread_unordered_load(
; GFX7-LABEL: flat_singlethread_unordered_load:
@@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread") unordered, align 4
@@ -369,6 +381,17 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread") monotonic, align 4
@@ -551,6 +574,17 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread") acquire, align 4
@@ -733,6 +767,17 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread") seq_cst, align 4
@@ -883,6 +928,16 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") unordered, align 4
@@ -1032,6 +1087,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") monotonic, align 4
@@ -1181,6 +1246,16 @@ define amdgpu_kernel void @flat_singlethread_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") release, align 4
@@ -1330,6 +1405,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") seq_cst, align 4
@@ -1479,6 +1564,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") monotonic
@@ -1628,6 +1723,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire
@@ -1777,6 +1882,16 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") release
@@ -1926,6 +2041,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel
@@ -2075,6 +2200,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst
@@ -2268,6 +2403,18 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire
@@ -2462,6 +2609,18 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel
@@ -2656,6 +2815,18 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst
@@ -2894,6 +3065,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3132,6 +3317,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3370,6 +3569,20 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3608,6 +3821,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3846,6 +4073,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4084,6 +4325,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4322,6 +4577,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4560,6 +4829,20 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4798,6 +5081,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5036,6 +5333,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5274,6 +5585,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5512,6 +5837,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5750,6 +6089,20 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5988,6 +6341,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6226,6 +6593,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6508,6 +6889,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6792,6 +7189,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7076,6 +7489,22 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7360,6 +7789,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7644,6 +8089,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7928,6 +8389,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8212,6 +8689,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8496,6 +8989,22 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8780,6 +9289,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9064,6 +9589,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9348,6 +9889,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9632,6 +10189,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9916,6 +10489,22 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10200,6 +10789,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10484,6 +11089,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10668,6 +11289,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread-one-as") unordered, align 4
@@ -10850,6 +11482,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread-one-as") monotonic, align 4
@@ -11032,6 +11675,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread-one-as") acquire, align 4
@@ -11214,6 +11868,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("singlethread-one-as") seq_cst, align 4
@@ -11364,6 +12029,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") unordered, align 4
@@ -11513,6 +12188,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") monotonic, align 4
@@ -11662,6 +12347,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") release, align 4
@@ -11811,6 +12506,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") seq_cst, align 4
@@ -11960,6 +12665,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") monotonic
@@ -12109,6 +12824,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -12258,6 +12983,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") release
@@ -12407,6 +13142,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -12556,6 +13301,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -12749,6 +13504,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -12943,6 +13710,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -13137,6 +13916,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -13375,6 +14166,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -13613,6 +14418,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -13851,6 +14670,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14089,6 +14922,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14327,6 +15174,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14565,6 +15426,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14803,6 +15678,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15041,6 +15930,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15279,6 +16182,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15517,6 +16434,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15755,6 +16686,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15993,6 +16938,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16231,6 +17190,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16469,6 +17442,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16707,6 +17694,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16989,6 +17990,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17273,6 +18290,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17557,6 +18590,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17841,6 +18890,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18125,6 +19190,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18409,6 +19490,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18693,6 +19790,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18977,6 +20090,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19261,6 +20390,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19545,6 +20690,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19829,6 +20990,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20113,6 +21290,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20397,6 +21590,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20681,6 +21890,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20965,6 +22190,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index 919fc3e8..9d70a24 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -11,6 +11,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @flat_system_unordered_load(
; GFX7-LABEL: flat_system_unordered_load:
@@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_system_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in unordered, align 4
@@ -369,6 +381,17 @@ define amdgpu_kernel void @flat_system_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in monotonic, align 4
@@ -568,6 +591,18 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in acquire, align 4
@@ -793,6 +828,24 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in seq_cst, align 4
@@ -943,6 +996,16 @@ define amdgpu_kernel void @flat_system_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out unordered, align 4
@@ -1092,6 +1155,16 @@ define amdgpu_kernel void @flat_system_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out monotonic, align 4
@@ -1269,6 +1342,21 @@ define amdgpu_kernel void @flat_system_release_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out release, align 4
@@ -1446,6 +1534,21 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out seq_cst, align 4
@@ -1595,6 +1698,16 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in monotonic
@@ -1777,6 +1890,18 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acquire
@@ -1954,6 +2079,21 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in release
@@ -2164,6 +2304,23 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel
@@ -2374,6 +2531,23 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst
@@ -2584,6 +2758,19 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acquire
@@ -2827,6 +3014,26 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel
@@ -3070,6 +3277,26 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst
@@ -3308,6 +3535,20 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3579,6 +3820,22 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3845,6 +4102,25 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4144,6 +4420,27 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4443,6 +4740,27 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4714,6 +5032,22 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4985,6 +5319,22 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5284,6 +5634,27 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5583,6 +5954,27 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5882,6 +6274,27 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6181,6 +6594,27 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6480,6 +6914,27 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6779,6 +7234,27 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7078,6 +7554,27 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7377,6 +7874,27 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7659,6 +8177,22 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7960,6 +8494,23 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8272,6 +8823,27 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8605,6 +9177,30 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8938,6 +9534,30 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9243,6 +9863,25 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9544,6 +10183,23 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9877,6 +10533,30 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10210,6 +10890,30 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10543,6 +11247,30 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10876,6 +11604,30 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11205,6 +11957,28 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11538,6 +12312,30 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11871,6 +12669,30 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12204,6 +13026,30 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12388,6 +13234,17 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("one-as") unordered, align 4
@@ -12570,6 +13427,17 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("one-as") monotonic, align 4
@@ -12779,6 +13647,19 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("one-as") acquire, align 4
@@ -13014,6 +13895,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("one-as") seq_cst, align 4
@@ -13164,6 +14064,16 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") unordered, align 4
@@ -13313,6 +14223,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") monotonic, align 4
@@ -13490,6 +14410,21 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") release, align 4
@@ -13667,6 +14602,21 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") seq_cst, align 4
@@ -13816,6 +14766,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") monotonic
@@ -13994,6 +14954,18 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire
@@ -14171,6 +15143,21 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") release
@@ -14377,6 +15364,23 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel
@@ -14583,6 +15587,23 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst
@@ -14803,6 +15824,20 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire
@@ -15056,6 +16091,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel
@@ -15309,6 +16365,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst
@@ -15547,6 +16624,20 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15814,6 +16905,22 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16080,6 +17187,25 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16375,6 +17501,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16670,6 +17817,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16937,6 +18105,22 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17204,6 +18388,22 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17499,6 +18699,27 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17794,6 +19015,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18089,6 +19331,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18384,6 +19647,27 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18679,6 +19963,27 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18974,6 +20279,27 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19269,6 +20595,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19564,6 +20911,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19846,6 +21214,22 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20157,6 +21541,24 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20469,6 +21871,27 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20812,6 +22235,31 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21155,6 +22603,31 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21470,6 +22943,26 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21781,6 +23274,24 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22124,6 +23635,31 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22467,6 +24003,31 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22810,6 +24371,31 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23153,6 +24739,31 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23492,6 +25103,29 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23835,6 +25469,31 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24178,6 +25837,31 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24521,6 +26205,31 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index a88e0e2..77f52e4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -7,6 +7,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX7-LABEL: flat_nontemporal_load_0:
@@ -143,6 +144,19 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_nontemporal_load_0:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load volatile i32, ptr %in, align 4
@@ -162,7 +176,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX7-NEXT: s_mov_b32 s6, 2
; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: ; implicit-def: $sgpr6
; GFX7-NEXT: v_mov_b32_e32 v0, 0
; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v2, v0
@@ -197,7 +210,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX10-WGP-NEXT: s_mov_b32 s6, 2
; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v1, s6, v0
; GFX10-WGP-NEXT: s_mov_b32 s6, 0
-; GFX10-WGP-NEXT: ; implicit-def: $sgpr6
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v0
@@ -231,7 +243,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX10-CU-NEXT: s_mov_b32 s6, 2
; GFX10-CU-NEXT: v_lshlrev_b32_e64 v1, s6, v0
; GFX10-CU-NEXT: s_mov_b32 s6, 0
-; GFX10-CU-NEXT: ; implicit-def: $sgpr6
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v0
@@ -260,7 +271,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 0
-; SKIP-CACHE-INV-NEXT: ; implicit-def: $sgpr2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0
@@ -292,7 +302,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-WGP-NEXT: s_mov_b32 s2, 2
; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX11-WGP-NEXT: s_mov_b32 s2, 0
-; GFX11-WGP-NEXT: ; implicit-def: $sgpr2
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v0
@@ -323,7 +332,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-CU-NEXT: s_mov_b32 s2, 2
; GFX11-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX11-CU-NEXT: s_mov_b32 s2, 0
-; GFX11-CU-NEXT: ; implicit-def: $sgpr2
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v0
@@ -355,7 +363,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-WGP-NEXT: s_mov_b32 s2, 0
-; GFX12-WGP-NEXT: ; implicit-def: $sgpr2
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0
@@ -391,7 +398,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-CU-NEXT: s_mov_b32 s2, 0
-; GFX12-CU-NEXT: ; implicit-def: $sgpr2
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0
@@ -415,6 +421,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_nontemporal_load_1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -563,6 +586,18 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_nontemporal_store_0:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load i32, ptr %in, align 4
@@ -585,7 +620,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX7-NEXT: s_mov_b32 s4, 2
; GFX7-NEXT: v_lshlrev_b32_e64 v3, s4, v0
; GFX7-NEXT: s_mov_b32 s4, 0
-; GFX7-NEXT: ; implicit-def: $sgpr4
; GFX7-NEXT: v_mov_b32_e32 v0, 0
; GFX7-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v4, v0
@@ -618,7 +652,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX10-WGP-NEXT: s_mov_b32 s4, 2
; GFX10-WGP-NEXT: v_lshlrev_b32_e64 v3, s4, v0
; GFX10-WGP-NEXT: s_mov_b32 s4, 0
-; GFX10-WGP-NEXT: ; implicit-def: $sgpr4
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v4, v0
@@ -650,7 +683,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX10-CU-NEXT: s_mov_b32 s4, 2
; GFX10-CU-NEXT: v_lshlrev_b32_e64 v3, s4, v0
; GFX10-CU-NEXT: s_mov_b32 s4, 0
-; GFX10-CU-NEXT: ; implicit-def: $sgpr4
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v4, v0
@@ -678,7 +710,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, 2
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, 0
-; SKIP-CACHE-INV-NEXT: ; implicit-def: $sgpr0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v4, v0
@@ -709,7 +740,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-WGP-NEXT: s_mov_b32 s0, 2
; GFX11-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; GFX11-WGP-NEXT: s_mov_b32 s0, 0
-; GFX11-WGP-NEXT: ; implicit-def: $sgpr0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v4, v0
@@ -739,7 +769,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-CU-NEXT: s_mov_b32 s0, 2
; GFX11-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; GFX11-CU-NEXT: s_mov_b32 s0, 0
-; GFX11-CU-NEXT: ; implicit-def: $sgpr0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v4, v0
@@ -771,7 +800,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; GFX12-WGP-NEXT: s_mov_b32 s0, 0
-; GFX12-WGP-NEXT: ; implicit-def: $sgpr0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0
@@ -809,7 +837,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; GFX12-CU-NEXT: s_mov_b32 s0, 0
-; GFX12-CU-NEXT: ; implicit-def: $sgpr0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0
@@ -831,6 +858,21 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_nontemporal_store_1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3]
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s2, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -971,6 +1013,17 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_volatile_workgroup_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic volatile i32, ptr %in syncscope("workgroup") acquire, align 4
@@ -1090,6 +1143,17 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_volatile_workgroup_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic volatile i32 %in, ptr %out syncscope("workgroup") release, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index 7c637a2..f086542 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -11,6 +11,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @flat_wavefront_unordered_load(
; GFX7-LABEL: flat_wavefront_unordered_load:
@@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront") unordered, align 4
@@ -369,6 +381,17 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront") monotonic, align 4
@@ -551,6 +574,17 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront") acquire, align 4
@@ -733,6 +767,17 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront") seq_cst, align 4
@@ -883,6 +928,16 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") unordered, align 4
@@ -1032,6 +1087,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") monotonic, align 4
@@ -1181,6 +1246,16 @@ define amdgpu_kernel void @flat_wavefront_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") release, align 4
@@ -1330,6 +1405,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") seq_cst, align 4
@@ -1479,6 +1564,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") monotonic
@@ -1628,6 +1723,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire
@@ -1777,6 +1882,16 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") release
@@ -1926,6 +2041,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel
@@ -2075,6 +2200,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst
@@ -2268,6 +2403,18 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire
@@ -2462,6 +2609,18 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel
@@ -2656,6 +2815,18 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst
@@ -2894,6 +3065,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3132,6 +3317,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3370,6 +3569,20 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3608,6 +3821,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3846,6 +4073,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4084,6 +4325,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4322,6 +4577,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4560,6 +4829,20 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4798,6 +5081,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5036,6 +5333,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5274,6 +5585,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5512,6 +5837,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5750,6 +6089,20 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5988,6 +6341,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6226,6 +6593,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6508,6 +6889,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6792,6 +7189,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7076,6 +7489,22 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7360,6 +7789,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7644,6 +8089,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7928,6 +8389,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8212,6 +8689,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8496,6 +8989,22 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8780,6 +9289,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9064,6 +9589,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9348,6 +9889,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9632,6 +10189,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9916,6 +10489,22 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10200,6 +10789,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10484,6 +11089,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10668,6 +11289,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront-one-as") unordered, align 4
@@ -10850,6 +11482,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront-one-as") monotonic, align 4
@@ -11032,6 +11675,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront-one-as") acquire, align 4
@@ -11214,6 +11868,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("wavefront-one-as") seq_cst, align 4
@@ -11364,6 +12029,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") unordered, align 4
@@ -11513,6 +12188,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") monotonic, align 4
@@ -11662,6 +12347,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") release, align 4
@@ -11811,6 +12506,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") seq_cst, align 4
@@ -11960,6 +12665,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") monotonic
@@ -12109,6 +12824,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -12258,6 +12983,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") release
@@ -12407,6 +13142,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -12556,6 +13301,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -12749,6 +13504,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -12943,6 +13710,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -13137,6 +13916,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -13375,6 +14166,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -13613,6 +14418,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -13851,6 +14670,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14089,6 +14922,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14327,6 +15174,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14565,6 +15426,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14803,6 +15678,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15041,6 +15930,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15279,6 +16182,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15517,6 +16434,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15755,6 +16686,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15993,6 +16938,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16231,6 +17190,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16469,6 +17442,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16707,6 +17694,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16989,6 +17990,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17273,6 +18290,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17557,6 +18590,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17841,6 +18890,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18125,6 +19190,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18409,6 +19490,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18693,6 +19790,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18977,6 +20090,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19261,6 +20390,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19545,6 +20690,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19829,6 +20990,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20113,6 +21290,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20397,6 +21590,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20681,6 +21890,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index 0fd4aa4..d8e6ad0 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -11,6 +11,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @flat_workgroup_unordered_load(
; GFX7-LABEL: flat_workgroup_unordered_load:
@@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup") unordered, align 4
@@ -369,6 +381,17 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup") monotonic, align 4
@@ -563,6 +586,17 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup") acquire, align 4
@@ -776,6 +810,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup") seq_cst, align 4
@@ -926,6 +972,16 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") unordered, align 4
@@ -1075,6 +1131,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") monotonic, align 4
@@ -1241,6 +1307,17 @@ define amdgpu_kernel void @flat_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") release, align 4
@@ -1407,6 +1484,17 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") seq_cst, align 4
@@ -1556,6 +1644,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") monotonic
@@ -1724,6 +1822,17 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire
@@ -1890,6 +1999,17 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") release
@@ -2075,6 +2195,18 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel
@@ -2260,6 +2392,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst
@@ -2465,6 +2609,18 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire
@@ -2690,6 +2846,19 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel
@@ -2915,6 +3084,19 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst
@@ -3153,6 +3335,20 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3410,6 +3606,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3665,6 +3876,21 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3939,6 +4165,22 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4213,6 +4455,22 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4470,6 +4728,21 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4727,6 +5000,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5001,6 +5289,22 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5275,6 +5579,22 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5549,6 +5869,22 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5823,6 +6159,22 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6105,6 +6457,22 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6401,6 +6769,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6702,6 +7086,23 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7017,6 +7418,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7332,6 +7750,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7630,6 +8065,22 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7926,6 +8377,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8241,6 +8708,23 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8556,6 +9040,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8871,6 +9372,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9186,6 +9704,23 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9499,6 +10034,23 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9814,6 +10366,23 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10129,6 +10698,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10444,6 +11030,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10628,6 +11231,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup-one-as") unordered, align 4
@@ -10810,6 +11424,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup-one-as") monotonic, align 4
@@ -11000,6 +11625,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup-one-as") acquire, align 4
@@ -11202,6 +11838,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -11352,6 +11999,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") unordered, align 4
@@ -11501,6 +12158,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") monotonic, align 4
@@ -11660,6 +12327,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") release, align 4
@@ -11819,6 +12496,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -11968,6 +12655,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -12127,6 +12824,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -12286,6 +12993,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") release
@@ -12455,6 +13172,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -12624,6 +13351,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -12825,6 +13562,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -13039,6 +13788,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -13253,6 +14014,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -13491,6 +14264,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -13739,6 +14526,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -13987,6 +14788,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14245,6 +15060,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14503,6 +15332,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14751,6 +15594,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14999,6 +15856,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15257,6 +16128,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15515,6 +16400,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15773,6 +16672,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16031,6 +16944,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16289,6 +17216,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16547,6 +17488,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16805,6 +17760,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17063,6 +18032,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17345,6 +18328,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17637,6 +18636,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17931,6 +18946,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18235,6 +19266,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18539,6 +19586,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18833,6 +19896,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19125,6 +20204,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19429,6 +20524,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19733,6 +20844,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20037,6 +21164,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20341,6 +21484,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20643,6 +21802,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20947,6 +22122,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21251,6 +22442,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21555,6 +22762,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index 74a72e0..184e154 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -12,6 +12,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @global_agent_unordered_load(
; GFX6-LABEL: global_agent_unordered_load:
@@ -190,6 +191,17 @@ define amdgpu_kernel void @global_agent_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent") unordered, align 4
@@ -374,6 +386,17 @@ define amdgpu_kernel void @global_agent_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent") monotonic, align 4
@@ -574,6 +597,18 @@ define amdgpu_kernel void @global_agent_acquire_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent") acquire, align 4
@@ -793,6 +828,24 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4
@@ -950,6 +1003,16 @@ define amdgpu_kernel void @global_agent_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") unordered, align 4
@@ -1106,6 +1169,16 @@ define amdgpu_kernel void @global_agent_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") monotonic, align 4
@@ -1287,6 +1360,20 @@ define amdgpu_kernel void @global_agent_release_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") release, align 4
@@ -1468,6 +1555,20 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") seq_cst, align 4
@@ -1622,6 +1723,16 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") monotonic
@@ -1805,6 +1916,18 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire
@@ -1984,6 +2107,20 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") release
@@ -2192,6 +2329,22 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel
@@ -2400,6 +2553,22 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
@@ -2598,6 +2767,19 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire
@@ -2826,6 +3008,25 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel
@@ -3054,6 +3255,25 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
@@ -3273,6 +3493,20 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3521,6 +3755,22 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3765,6 +4015,24 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4038,6 +4306,26 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4311,6 +4599,26 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4559,6 +4867,22 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4807,6 +5131,22 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5080,6 +5420,26 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5353,6 +5713,26 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5626,6 +6006,26 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5899,6 +6299,26 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6172,6 +6592,26 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6445,6 +6885,26 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6718,6 +7178,26 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6991,6 +7471,26 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7240,6 +7740,22 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7507,6 +8023,23 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7783,6 +8316,26 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8079,6 +8632,29 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8375,6 +8951,29 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8646,6 +9245,25 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8913,6 +9531,23 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9209,6 +9844,29 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9505,6 +10163,29 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9801,6 +10482,29 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10097,6 +10801,29 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10389,6 +11116,27 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10685,6 +11433,29 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10981,6 +11752,29 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11277,6 +12071,29 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11463,6 +12280,17 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") unordered, align 4
@@ -11647,6 +12475,17 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") monotonic, align 4
@@ -11847,6 +12686,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") acquire, align 4
@@ -12066,6 +12917,24 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") seq_cst, align 4
@@ -12223,6 +13092,16 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") unordered, align 4
@@ -12379,6 +13258,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") monotonic, align 4
@@ -12560,6 +13449,20 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") release, align 4
@@ -12741,6 +13644,20 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") seq_cst, align 4
@@ -12895,6 +13812,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") monotonic
@@ -13078,6 +14005,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire
@@ -13257,6 +14196,20 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") release
@@ -13465,6 +14418,22 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -13673,6 +14642,22 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -13871,6 +14856,19 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire
@@ -14099,6 +15097,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -14327,6 +15344,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -14546,6 +15582,20 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14794,6 +15844,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15038,6 +16104,24 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15311,6 +16395,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15584,6 +16688,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15832,6 +16956,22 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16080,6 +17220,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16353,6 +17509,26 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16626,6 +17802,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16899,6 +18095,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17172,6 +18388,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17445,6 +18681,26 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17718,6 +18974,26 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17991,6 +19267,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18264,6 +19560,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18513,6 +19829,22 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18780,6 +20112,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19076,6 +20425,29 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19372,6 +20744,29 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19643,6 +21038,25 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19910,6 +21324,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20206,6 +21637,29 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20502,6 +21956,29 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20798,6 +22275,29 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21094,6 +22594,29 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21386,6 +22909,27 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21682,6 +23226,29 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21978,6 +23545,29 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22274,6 +23864,29 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
index 5f952b9..ed2d623 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GFX12-LABEL: global_last_use_load_0:
@@ -14,6 +15,18 @@ define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addr
; GFX12-NEXT: v_mov_b32_e32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_last_use_load_0:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%val = load i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{}
store i32 %val, ptr addrspace(1) %out
@@ -37,6 +50,21 @@ define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_last_use_load_1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
@@ -58,6 +86,19 @@ define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %i
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_last_use_and_volatile_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%val = load volatile i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{}
store i32 %val, ptr addrspace(1) %out
@@ -81,6 +122,21 @@ define amdgpu_kernel void @global_last_use_and_nontemporal_load(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_last_use_and_nontemporal_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index 16e5505..c1bfe21 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -12,6 +12,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @global_nontemporal_load_0(
; GFX6-LABEL: global_nontemporal_load_0:
@@ -189,6 +190,18 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_nontemporal_load_0:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load i32, ptr addrspace(1) %in, align 4, !nontemporal !0
@@ -219,7 +232,6 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX6-NEXT: s_mov_b32 s12, 2
; GFX6-NEXT: v_lshlrev_b32_e64 v0, s12, v0
; GFX6-NEXT: s_mov_b32 s12, 0
-; GFX6-NEXT: ; implicit-def: $sgpr12
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
@@ -239,7 +251,6 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX7-NEXT: s_mov_b32 s6, 2
; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: ; implicit-def: $sgpr6
; GFX7-NEXT: v_mov_b32_e32 v0, 0
; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v2, v0
@@ -310,7 +321,6 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 2
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v0, s8, v0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0
-; SKIP-CACHE-INV-NEXT: ; implicit-def: $sgpr8
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, 0
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
@@ -448,6 +458,21 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_nontemporal_load_1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_NT
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -633,6 +658,18 @@ define amdgpu_kernel void @global_nontemporal_store_0(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_nontemporal_store_0:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load i32, ptr addrspace(1) %in, align 4
@@ -656,7 +693,6 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX6-NEXT: s_mov_b32 s9, 2
; GFX6-NEXT: v_lshlrev_b32_e64 v1, s9, v0
; GFX6-NEXT: s_mov_b32 s9, 0
-; GFX6-NEXT: ; implicit-def: $sgpr9
; GFX6-NEXT: v_mov_b32_e32 v0, 0
; GFX6-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v2, v0
@@ -677,7 +713,6 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX7-NEXT: s_mov_b32 s5, 2
; GFX7-NEXT: v_lshlrev_b32_e64 v1, s5, v0
; GFX7-NEXT: s_mov_b32 s5, 0
-; GFX7-NEXT: ; implicit-def: $sgpr5
; GFX7-NEXT: v_mov_b32_e32 v0, 0
; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v2, v0
@@ -736,7 +771,6 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 2
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s5, v0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0
-; SKIP-CACHE-INV-NEXT: ; implicit-def: $sgpr5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0
@@ -866,6 +900,20 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_nontemporal_store_1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s3, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v0, v0, s3
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset th:TH_STORE_NT
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1056,6 +1104,19 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_nontemporal_volatile_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load volatile i32, ptr addrspace(1) %in, align 4, !nontemporal !0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index 8042d38..6a5a6e0 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -12,6 +12,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @global_singlethread_unordered_load(
; GFX6-LABEL: global_singlethread_unordered_load:
@@ -190,6 +191,17 @@ define amdgpu_kernel void @global_singlethread_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") unordered, align 4
@@ -374,6 +386,17 @@ define amdgpu_kernel void @global_singlethread_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") monotonic, align 4
@@ -558,6 +581,17 @@ define amdgpu_kernel void @global_singlethread_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") acquire, align 4
@@ -742,6 +776,17 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") seq_cst, align 4
@@ -899,6 +944,16 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") unordered, align 4
@@ -1055,6 +1110,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") monotonic, align 4
@@ -1211,6 +1276,16 @@ define amdgpu_kernel void @global_singlethread_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") release, align 4
@@ -1367,6 +1442,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") seq_cst, align 4
@@ -1521,6 +1606,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") monotonic
@@ -1675,6 +1770,16 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire
@@ -1829,6 +1934,16 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") release
@@ -1983,6 +2098,16 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel
@@ -2137,6 +2262,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst
@@ -2319,6 +2454,18 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire
@@ -2502,6 +2649,18 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel
@@ -2685,6 +2844,18 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst
@@ -2904,6 +3075,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3123,6 +3308,20 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3342,6 +3541,20 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3561,6 +3774,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3780,6 +4007,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3999,6 +4240,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4218,6 +4473,20 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4437,6 +4706,20 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4656,6 +4939,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4875,6 +5172,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5094,6 +5405,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5313,6 +5638,20 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5532,6 +5871,20 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5751,6 +6104,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5970,6 +6337,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6219,6 +6600,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6470,6 +6867,22 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6721,6 +7134,22 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6972,6 +7401,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7223,6 +7668,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7474,6 +7935,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7725,6 +8202,22 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7976,6 +8469,22 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8227,6 +8736,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8478,6 +9003,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8729,6 +9270,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8980,6 +9537,22 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9231,6 +9804,22 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9482,6 +10071,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9733,6 +10338,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9919,6 +10540,17 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") unordered, align 4
@@ -10103,6 +10735,17 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") monotonic, align 4
@@ -10287,6 +10930,17 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") acquire, align 4
@@ -10471,6 +11125,17 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") seq_cst, align 4
@@ -10628,6 +11293,16 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") unordered, align 4
@@ -10784,6 +11459,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") monotonic, align 4
@@ -10940,6 +11625,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") release, align 4
@@ -11096,6 +11791,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") seq_cst, align 4
@@ -11250,6 +11955,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") monotonic
@@ -11404,6 +12119,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -11558,6 +12283,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") release
@@ -11712,6 +12447,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -11866,6 +12611,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -12048,6 +12803,18 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -12231,6 +12998,18 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -12414,6 +13193,18 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -12633,6 +13424,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -12852,6 +13657,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13071,6 +13890,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13290,6 +14123,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13509,6 +14356,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13728,6 +14589,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13947,6 +14822,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14166,6 +15055,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14385,6 +15288,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14604,6 +15521,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14823,6 +15754,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15042,6 +15987,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15261,6 +16220,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15480,6 +16453,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15699,6 +16686,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15948,6 +16949,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16199,6 +17216,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16450,6 +17483,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16701,6 +17750,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16952,6 +18017,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17203,6 +18284,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17454,6 +18551,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17705,6 +18818,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17956,6 +19085,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18207,6 +19352,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18458,6 +19619,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18709,6 +19886,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18960,6 +20153,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19211,6 +20420,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19462,6 +20687,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index be14846..7ddd515 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -12,6 +12,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @global_system_unordered_load(
; GFX6-LABEL: global_system_unordered_load:
@@ -190,6 +191,17 @@ define amdgpu_kernel void @global_system_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in unordered, align 4
@@ -374,6 +386,17 @@ define amdgpu_kernel void @global_system_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in monotonic, align 4
@@ -576,6 +599,18 @@ define amdgpu_kernel void @global_system_acquire_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in acquire, align 4
@@ -797,6 +832,24 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in seq_cst, align 4
@@ -954,6 +1007,16 @@ define amdgpu_kernel void @global_system_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out unordered, align 4
@@ -1110,6 +1173,16 @@ define amdgpu_kernel void @global_system_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out monotonic, align 4
@@ -1295,6 +1368,21 @@ define amdgpu_kernel void @global_system_release_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out release, align 4
@@ -1480,6 +1568,21 @@ define amdgpu_kernel void @global_system_seq_cst_store(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4
@@ -1634,6 +1737,16 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in monotonic
@@ -1819,6 +1932,18 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire
@@ -2002,6 +2127,21 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in release
@@ -2216,6 +2356,23 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel
@@ -2430,6 +2587,23 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst
@@ -2630,6 +2804,19 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire
@@ -2864,6 +3051,26 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel
@@ -3098,6 +3305,26 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst
@@ -3317,6 +3544,20 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3567,6 +3808,22 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3815,6 +4072,25 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4094,6 +4370,27 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4373,6 +4670,27 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4623,6 +4941,22 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4873,6 +5207,22 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5152,6 +5502,27 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5431,6 +5802,27 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5710,6 +6102,27 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5989,6 +6402,27 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6238,6 +6672,22 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6507,6 +6957,23 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6809,6 +7276,30 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7111,6 +7602,30 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7384,6 +7899,25 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7653,6 +8187,23 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7955,6 +8506,30 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8257,6 +8832,30 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8559,6 +9158,30 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8861,6 +9484,30 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9159,6 +9806,28 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9461,6 +10130,30 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9763,6 +10456,30 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10065,6 +10782,30 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10251,6 +10992,17 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") unordered, align 4
@@ -10435,6 +11187,17 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") monotonic, align 4
@@ -10637,6 +11400,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") acquire, align 4
@@ -10858,6 +11633,24 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") seq_cst, align 4
@@ -11015,6 +11808,16 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") unordered, align 4
@@ -11171,6 +11974,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") monotonic, align 4
@@ -11356,6 +12169,21 @@ define amdgpu_kernel void @global_system_one_as_release_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") release, align 4
@@ -11541,6 +12369,21 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") seq_cst, align 4
@@ -11695,6 +12538,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") monotonic
@@ -11880,6 +12733,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire
@@ -12063,6 +12928,21 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") release
@@ -12277,6 +13157,23 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel
@@ -12491,6 +13388,23 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst
@@ -12691,6 +13605,19 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire
@@ -12925,6 +13852,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel
@@ -13159,6 +14106,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst
@@ -13378,6 +14345,20 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13628,6 +14609,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13876,6 +14873,25 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14155,6 +15171,27 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14434,6 +15471,27 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14684,6 +15742,22 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14934,6 +16008,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15213,6 +16303,27 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15492,6 +16603,27 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15771,6 +16903,27 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16050,6 +17203,27 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16329,6 +17503,27 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16608,6 +17803,27 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16887,6 +18103,27 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17166,6 +18403,27 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17415,6 +18673,22 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17684,6 +18958,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17964,6 +19255,27 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18266,6 +19578,30 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18568,6 +19904,30 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18841,6 +20201,25 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19110,6 +20489,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19412,6 +20808,30 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19714,6 +21134,30 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20016,6 +21460,30 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20318,6 +21786,30 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20616,6 +22108,28 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20918,6 +22432,30 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21220,6 +22758,30 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21522,6 +23084,30 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index 8a5c5dd..1539fb5 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -8,6 +8,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @global_volatile_load_0(
; GFX6-LABEL: global_volatile_load_0:
@@ -146,6 +147,19 @@ define amdgpu_kernel void @global_volatile_load_0(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_volatile_load_0:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load volatile i32, ptr addrspace(1) %in, align 4
@@ -176,7 +190,6 @@ define amdgpu_kernel void @global_volatile_load_1(
; GFX6-NEXT: s_mov_b32 s8, 2
; GFX6-NEXT: v_lshlrev_b32_e64 v0, s8, v0
; GFX6-NEXT: s_mov_b32 s8, 0
-; GFX6-NEXT: ; implicit-def: $sgpr8
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
@@ -196,7 +209,6 @@ define amdgpu_kernel void @global_volatile_load_1(
; GFX7-NEXT: s_mov_b32 s6, 2
; GFX7-NEXT: v_lshlrev_b32_e64 v1, s6, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: ; implicit-def: $sgpr6
; GFX7-NEXT: v_mov_b32_e32 v0, 0
; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v2, v0
@@ -267,7 +279,6 @@ define amdgpu_kernel void @global_volatile_load_1(
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 2
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v0, s8, v0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0
-; SKIP-CACHE-INV-NEXT: ; implicit-def: $sgpr8
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, 0
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
@@ -345,6 +356,23 @@ define amdgpu_kernel void @global_volatile_load_1(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_volatile_load_1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -501,6 +529,19 @@ define amdgpu_kernel void @global_volatile_store_0(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_volatile_store_0:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load i32, ptr addrspace(1) %in, align 4
@@ -524,7 +565,6 @@ define amdgpu_kernel void @global_volatile_store_1(
; GFX6-NEXT: s_mov_b32 s5, 2
; GFX6-NEXT: v_lshlrev_b32_e64 v1, s5, v0
; GFX6-NEXT: s_mov_b32 s5, 0
-; GFX6-NEXT: ; implicit-def: $sgpr5
; GFX6-NEXT: v_mov_b32_e32 v0, 0
; GFX6-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v2, v0
@@ -546,7 +586,6 @@ define amdgpu_kernel void @global_volatile_store_1(
; GFX7-NEXT: s_mov_b32 s5, 2
; GFX7-NEXT: v_lshlrev_b32_e64 v1, s5, v0
; GFX7-NEXT: s_mov_b32 s5, 0
-; GFX7-NEXT: ; implicit-def: $sgpr5
; GFX7-NEXT: v_mov_b32_e32 v0, 0
; GFX7-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v2, v0
@@ -608,7 +647,6 @@ define amdgpu_kernel void @global_volatile_store_1(
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 2
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v1, s5, v0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0
-; SKIP-CACHE-INV-NEXT: ; implicit-def: $sgpr5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 0
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, v0
@@ -693,6 +731,21 @@ define amdgpu_kernel void @global_volatile_store_1(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_volatile_store_1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s3, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v0, v0, s3
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -838,6 +891,17 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_volatile_workgroup_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic volatile i32, ptr addrspace(1) %in syncscope("workgroup") acquire, align 4
@@ -969,6 +1033,17 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_volatile_workgroup_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic volatile i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index 151ba07..1aa8305 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -12,6 +12,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @global_wavefront_unordered_load(
; GFX6-LABEL: global_wavefront_unordered_load:
@@ -190,6 +191,17 @@ define amdgpu_kernel void @global_wavefront_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") unordered, align 4
@@ -374,6 +386,17 @@ define amdgpu_kernel void @global_wavefront_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") monotonic, align 4
@@ -558,6 +581,17 @@ define amdgpu_kernel void @global_wavefront_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") acquire, align 4
@@ -742,6 +776,17 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") seq_cst, align 4
@@ -899,6 +944,16 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") unordered, align 4
@@ -1055,6 +1110,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") monotonic, align 4
@@ -1211,6 +1276,16 @@ define amdgpu_kernel void @global_wavefront_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") release, align 4
@@ -1367,6 +1442,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") seq_cst, align 4
@@ -1521,6 +1606,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") monotonic
@@ -1675,6 +1770,16 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire
@@ -1829,6 +1934,16 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") release
@@ -1983,6 +2098,16 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel
@@ -2137,6 +2262,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst
@@ -2319,6 +2454,18 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire
@@ -2502,6 +2649,18 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel
@@ -2685,6 +2844,18 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst
@@ -2904,6 +3075,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3123,6 +3308,20 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3342,6 +3541,20 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3561,6 +3774,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3780,6 +4007,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3999,6 +4240,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4218,6 +4473,20 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4437,6 +4706,20 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4656,6 +4939,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4875,6 +5172,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5094,6 +5405,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5313,6 +5638,20 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5532,6 +5871,20 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5751,6 +6104,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5970,6 +6337,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6219,6 +6600,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6470,6 +6867,22 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6721,6 +7134,22 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6972,6 +7401,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7223,6 +7668,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7474,6 +7935,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7725,6 +8202,22 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7976,6 +8469,22 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8227,6 +8736,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8478,6 +9003,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8729,6 +9270,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8980,6 +9537,22 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9231,6 +9804,22 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9482,6 +10071,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9733,6 +10338,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9919,6 +10540,17 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") unordered, align 4
@@ -10103,6 +10735,17 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") monotonic, align 4
@@ -10287,6 +10930,17 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") acquire, align 4
@@ -10471,6 +11125,17 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") seq_cst, align 4
@@ -10628,6 +11293,16 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") unordered, align 4
@@ -10784,6 +11459,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") monotonic, align 4
@@ -10940,6 +11625,16 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") release, align 4
@@ -11096,6 +11791,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") seq_cst, align 4
@@ -11250,6 +11955,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") monotonic
@@ -11404,6 +12119,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -11558,6 +12283,16 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") release
@@ -11712,6 +12447,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -11866,6 +12611,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -12048,6 +12803,18 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -12231,6 +12998,18 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -12414,6 +13193,18 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -12633,6 +13424,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -12852,6 +13657,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13071,6 +13890,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13290,6 +14123,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13509,6 +14356,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13728,6 +14589,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13947,6 +14822,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14166,6 +15055,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14385,6 +15288,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14604,6 +15521,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14823,6 +15754,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15042,6 +15987,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15261,6 +16220,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15480,6 +16453,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15699,6 +16686,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15948,6 +16949,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16199,6 +17216,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16450,6 +17483,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16701,6 +17750,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16952,6 +18017,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17203,6 +18284,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17454,6 +18551,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17705,6 +18818,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17956,6 +19085,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18207,6 +19352,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18458,6 +19619,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18709,6 +19886,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18960,6 +20153,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19211,6 +20420,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19462,6 +20687,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index 69b0c7f..3eab16e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -12,6 +12,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @global_workgroup_unordered_load(
; GFX6-LABEL: global_workgroup_unordered_load:
@@ -190,6 +191,17 @@ define amdgpu_kernel void @global_workgroup_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") unordered, align 4
@@ -374,6 +386,17 @@ define amdgpu_kernel void @global_workgroup_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") monotonic, align 4
@@ -563,6 +586,17 @@ define amdgpu_kernel void @global_workgroup_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") acquire, align 4
@@ -764,6 +798,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") seq_cst, align 4
@@ -921,6 +967,16 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") unordered, align 4
@@ -1077,6 +1133,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") monotonic, align 4
@@ -1251,6 +1317,17 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4
@@ -1425,6 +1502,17 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") seq_cst, align 4
@@ -1579,6 +1667,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") monotonic
@@ -1743,6 +1841,16 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire
@@ -1915,6 +2023,17 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") release
@@ -2097,6 +2216,17 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2279,6 +2409,17 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
@@ -2466,6 +2607,18 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire
@@ -2674,6 +2827,19 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2882,6 +3048,19 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
@@ -3101,6 +3280,20 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3330,6 +3523,20 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3567,6 +3774,21 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3814,6 +4036,21 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4061,6 +4298,21 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4290,6 +4542,20 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4519,6 +4785,20 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4766,6 +5046,21 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5013,6 +5308,21 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5260,6 +5570,21 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5507,6 +5832,21 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5754,6 +6094,21 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6001,6 +6356,21 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6248,6 +6618,21 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6495,6 +6880,21 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6744,6 +7144,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7000,6 +7416,22 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7269,6 +7701,23 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7545,6 +7994,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7821,6 +8287,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8079,6 +8562,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8335,6 +8834,22 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8611,6 +9126,23 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8887,6 +9419,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9163,6 +9712,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9439,6 +10005,23 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9713,6 +10296,23 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9989,6 +10589,23 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10265,6 +10882,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10541,6 +11175,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10727,6 +11378,17 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") unordered, align 4
@@ -10911,6 +11573,17 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") monotonic, align 4
@@ -11100,6 +11773,17 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") acquire, align 4
@@ -11297,6 +11981,17 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -11454,6 +12149,16 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") unordered, align 4
@@ -11610,6 +12315,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") monotonic, align 4
@@ -11776,6 +12491,16 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") release, align 4
@@ -11942,6 +12667,16 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -12096,6 +12831,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -12260,6 +13005,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -12424,6 +13179,16 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") release
@@ -12598,6 +13363,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -12772,6 +13547,16 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -12959,6 +13744,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -13159,6 +13956,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -13359,6 +14168,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -13578,6 +14399,20 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13807,6 +14642,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14036,6 +14885,20 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14275,6 +15138,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14514,6 +15391,20 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14743,6 +15634,20 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14972,6 +15877,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15211,6 +16130,20 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15450,6 +16383,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15689,6 +16636,20 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15928,6 +16889,20 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16167,6 +17142,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16406,6 +17395,20 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16645,6 +17648,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16884,6 +17901,20 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17133,6 +18164,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17389,6 +18436,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17650,6 +18713,22 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17918,6 +18997,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18186,6 +19281,22 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18444,6 +19555,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18700,6 +19827,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18968,6 +20111,22 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19236,6 +20395,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19504,6 +20679,22 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19772,6 +20963,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20038,6 +21245,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20306,6 +21529,22 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20574,6 +21813,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20842,6 +22097,22 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
index 0467c50..102616b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
@@ -12,6 +12,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @local_agent_unordered_load(
; GFX6-LABEL: local_agent_unordered_load:
@@ -177,6 +178,18 @@ define amdgpu_kernel void @local_agent_unordered_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("agent") unordered, align 4
@@ -348,6 +361,18 @@ define amdgpu_kernel void @local_agent_monotonic_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("agent") monotonic, align 4
@@ -524,6 +549,18 @@ define amdgpu_kernel void @local_agent_acquire_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("agent") acquire, align 4
@@ -718,6 +755,19 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("agent") seq_cst, align 4
@@ -859,6 +909,16 @@ define amdgpu_kernel void @local_agent_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") unordered, align 4
@@ -999,6 +1059,16 @@ define amdgpu_kernel void @local_agent_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") monotonic, align 4
@@ -1157,6 +1227,17 @@ define amdgpu_kernel void @local_agent_release_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") release, align 4
@@ -1315,6 +1396,17 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") seq_cst, align 4
@@ -1455,6 +1547,16 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") monotonic
@@ -1611,6 +1713,17 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acquire
@@ -1769,6 +1882,17 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") release
@@ -1943,6 +2067,18 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel
@@ -2117,6 +2253,18 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst
@@ -2304,6 +2452,19 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acquire
@@ -2510,6 +2671,20 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel
@@ -2716,6 +2891,20 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst
@@ -2883,6 +3072,18 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3066,6 +3267,19 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3251,6 +3465,19 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3452,6 +3679,20 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3653,6 +3894,20 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3836,6 +4091,19 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4019,6 +4287,19 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4220,6 +4501,20 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4421,6 +4716,20 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4622,6 +4931,20 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4823,6 +5146,20 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5024,6 +5361,20 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5225,6 +5576,20 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5426,6 +5791,20 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5627,6 +6006,20 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5836,6 +6229,21 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6052,6 +6460,21 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6281,6 +6704,22 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6515,6 +6954,22 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6749,6 +7204,22 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6965,6 +7436,21 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7181,6 +7667,21 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7415,6 +7916,22 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7649,6 +8166,22 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7883,6 +8416,22 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8117,6 +8666,22 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8351,6 +8916,22 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8585,6 +9166,22 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8819,6 +9416,22 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9053,6 +9666,22 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9226,6 +9855,18 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("agent-one-as") unordered, align 4
@@ -9397,6 +10038,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("agent-one-as") monotonic, align 4
@@ -9568,6 +10221,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("agent-one-as") acquire, align 4
@@ -9739,6 +10404,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("agent-one-as") seq_cst, align 4
@@ -9880,6 +10557,16 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("agent-one-as") unordered, align 4
@@ -10020,6 +10707,16 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("agent-one-as") monotonic, align 4
@@ -10160,6 +10857,16 @@ define amdgpu_kernel void @local_agent_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("agent-one-as") release, align 4
@@ -10300,6 +11007,16 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("agent-one-as") seq_cst, align 4
@@ -10440,6 +11157,16 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") monotonic
@@ -10580,6 +11307,16 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") acquire
@@ -10720,6 +11457,16 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") release
@@ -10860,6 +11607,16 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -11000,6 +11757,16 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -11182,6 +11949,19 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") acquire
@@ -11365,6 +12145,19 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -11548,6 +12341,19 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -11715,6 +12521,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -11882,6 +12700,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12049,6 +12879,18 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12216,6 +13058,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12383,6 +13237,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12550,6 +13416,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12717,6 +13595,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12884,6 +13774,18 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13051,6 +13953,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13218,6 +14132,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13385,6 +14311,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13552,6 +14490,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13719,6 +14669,18 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13886,6 +14848,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14053,6 +15027,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14262,6 +15248,21 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14473,6 +15474,21 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14684,6 +15700,21 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14895,6 +15926,21 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15106,6 +16152,21 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15317,6 +16378,21 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15528,6 +16604,21 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15739,6 +16830,21 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15950,6 +17056,21 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16161,6 +17282,21 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16372,6 +17508,21 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16583,6 +17734,21 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16794,6 +17960,21 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -17005,6 +18186,21 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -17216,6 +18412,21 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
index 78209ee..c6f7ce5 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
@@ -12,6 +12,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @local_nontemporal_load_0(
; GFX6-LABEL: local_nontemporal_load_0:
@@ -193,6 +194,18 @@ define amdgpu_kernel void @local_nontemporal_load_0(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_nontemporal_load_0:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ds_load_b32 v1, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(1) %out) {
entry:
%val = load i32, ptr addrspace(3) %in, align 4, !nontemporal !0
@@ -428,6 +441,22 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_nontemporal_load_1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_mov_b32 s2, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v1, v1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 2
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_lshl_add_u32 v1, v1, s2, s3
+; GFX1250-NEXT: ds_load_b32 v1, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(1) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -597,6 +626,18 @@ define amdgpu_kernel void @local_nontemporal_store_0(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_nontemporal_store_0:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(3) %out) {
entry:
%val = load i32, ptr addrspace(1) %in, align 4
@@ -802,6 +843,22 @@ define amdgpu_kernel void @local_nontemporal_store_1(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_nontemporal_store_1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s1, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v0, v0, s1
+; GFX1250-NEXT: s_mov_b32 s1, 2
+; GFX1250-NEXT: v_lshl_add_u32 v0, v0, s1, s2
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(3) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -991,6 +1048,18 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_nontemporal_volatile_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ds_load_b32 v1, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(1) %out) {
entry:
%val = load volatile i32, ptr addrspace(3) %in, align 4, !nontemporal !0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
index f84d451..1800acb 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
@@ -12,6 +12,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @local_singlethread_unordered_load(
; GFX6-LABEL: local_singlethread_unordered_load:
@@ -177,6 +178,18 @@ define amdgpu_kernel void @local_singlethread_unordered_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread") unordered, align 4
@@ -348,6 +361,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread") monotonic, align 4
@@ -519,6 +544,18 @@ define amdgpu_kernel void @local_singlethread_acquire_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread") acquire, align 4
@@ -690,6 +727,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread") seq_cst, align 4
@@ -831,6 +880,16 @@ define amdgpu_kernel void @local_singlethread_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread") unordered, align 4
@@ -971,6 +1030,16 @@ define amdgpu_kernel void @local_singlethread_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread") monotonic, align 4
@@ -1111,6 +1180,16 @@ define amdgpu_kernel void @local_singlethread_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread") release, align 4
@@ -1251,6 +1330,16 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread") seq_cst, align 4
@@ -1391,6 +1480,16 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") monotonic
@@ -1531,6 +1630,16 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") acquire
@@ -1671,6 +1780,16 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") release
@@ -1811,6 +1930,16 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") acq_rel
@@ -1951,6 +2080,16 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") seq_cst
@@ -2133,6 +2272,19 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") acquire
@@ -2316,6 +2468,19 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") acq_rel
@@ -2499,6 +2664,19 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") seq_cst
@@ -2666,6 +2844,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -2833,6 +3023,18 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3000,6 +3202,18 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3167,6 +3381,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3334,6 +3560,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3501,6 +3739,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3668,6 +3918,18 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3835,6 +4097,18 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4002,6 +4276,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4169,6 +4455,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4336,6 +4634,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4503,6 +4813,18 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4670,6 +4992,18 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4837,6 +5171,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5004,6 +5350,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5213,6 +5571,21 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5424,6 +5797,21 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5635,6 +6023,21 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5846,6 +6249,21 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6057,6 +6475,21 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6268,6 +6701,21 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6479,6 +6927,21 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6690,6 +7153,21 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6901,6 +7379,21 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7112,6 +7605,21 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7323,6 +7831,21 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7534,6 +8057,21 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7745,6 +8283,21 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7956,6 +8509,21 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8167,6 +8735,21 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8340,6 +8923,18 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread-one-as") unordered, align 4
@@ -8511,6 +9106,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread-one-as") monotonic, align 4
@@ -8682,6 +9289,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread-one-as") acquire, align 4
@@ -8853,6 +9472,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread-one-as") seq_cst, align 4
@@ -8994,6 +9625,16 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread-one-as") unordered, align 4
@@ -9134,6 +9775,16 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread-one-as") monotonic, align 4
@@ -9274,6 +9925,16 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread-one-as") release, align 4
@@ -9414,6 +10075,16 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread-one-as") seq_cst, align 4
@@ -9554,6 +10225,16 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") monotonic
@@ -9694,6 +10375,16 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -9834,6 +10525,16 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") release
@@ -9974,6 +10675,16 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -10114,6 +10825,16 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -10296,6 +11017,19 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -10479,6 +11213,19 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -10662,6 +11409,19 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -10829,6 +11589,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -10996,6 +11768,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -11163,6 +11947,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -11330,6 +12126,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -11497,6 +12305,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -11664,6 +12484,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -11831,6 +12663,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -11998,6 +12842,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12165,6 +13021,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12332,6 +13200,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12499,6 +13379,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12666,6 +13558,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12833,6 +13737,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13000,6 +13916,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13167,6 +14095,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13376,6 +14316,21 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13587,6 +14542,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13798,6 +14768,21 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14009,6 +14994,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14220,6 +15220,21 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14431,6 +15446,21 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14642,6 +15672,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14853,6 +15898,21 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15064,6 +16124,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15275,6 +16350,21 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15486,6 +16576,21 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15697,6 +16802,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15908,6 +17028,21 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16119,6 +17254,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16330,6 +17480,21 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
index 74a2972..1356fe4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
@@ -12,6 +12,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @local_system_unordered_load(
; GFX6-LABEL: local_system_unordered_load:
@@ -177,6 +178,18 @@ define amdgpu_kernel void @local_system_unordered_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in unordered, align 4
@@ -348,6 +361,18 @@ define amdgpu_kernel void @local_system_monotonic_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in monotonic, align 4
@@ -524,6 +549,18 @@ define amdgpu_kernel void @local_system_acquire_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in acquire, align 4
@@ -718,6 +755,19 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in seq_cst, align 4
@@ -859,6 +909,16 @@ define amdgpu_kernel void @local_system_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out unordered, align 4
@@ -999,6 +1059,16 @@ define amdgpu_kernel void @local_system_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out monotonic, align 4
@@ -1157,6 +1227,17 @@ define amdgpu_kernel void @local_system_release_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out release, align 4
@@ -1315,6 +1396,17 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out seq_cst, align 4
@@ -1455,6 +1547,16 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in monotonic
@@ -1611,6 +1713,17 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acquire
@@ -1769,6 +1882,17 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in release
@@ -1943,6 +2067,18 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel
@@ -2117,6 +2253,18 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst
@@ -2304,6 +2452,19 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acquire
@@ -2510,6 +2671,20 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel
@@ -2716,6 +2891,20 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst
@@ -2883,6 +3072,18 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3066,6 +3267,19 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3251,6 +3465,19 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3452,6 +3679,20 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3653,6 +3894,20 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3836,6 +4091,19 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4019,6 +4287,19 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4220,6 +4501,20 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4421,6 +4716,20 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4622,6 +4931,20 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4823,6 +5146,20 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5024,6 +5361,20 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5225,6 +5576,20 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5426,6 +5791,20 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5627,6 +6006,20 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5836,6 +6229,21 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6052,6 +6460,21 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6281,6 +6704,22 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6515,6 +6954,22 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6749,6 +7204,22 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6965,6 +7436,21 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7181,6 +7667,21 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7415,6 +7916,22 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7649,6 +8166,22 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7883,6 +8416,22 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8117,6 +8666,22 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8351,6 +8916,22 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8585,6 +9166,22 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8819,6 +9416,22 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9053,6 +9666,22 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9226,6 +9855,18 @@ define amdgpu_kernel void @local_system_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("one-as") unordered, align 4
@@ -9397,6 +10038,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("one-as") monotonic, align 4
@@ -9568,6 +10221,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("one-as") acquire, align 4
@@ -9739,6 +10404,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("one-as") seq_cst, align 4
@@ -9880,6 +10557,16 @@ define amdgpu_kernel void @local_system_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("one-as") unordered, align 4
@@ -10020,6 +10707,16 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("one-as") monotonic, align 4
@@ -10160,6 +10857,16 @@ define amdgpu_kernel void @local_system_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("one-as") release, align 4
@@ -10300,6 +11007,16 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("one-as") seq_cst, align 4
@@ -10440,6 +11157,16 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") monotonic
@@ -10580,6 +11307,16 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") acquire
@@ -10720,6 +11457,16 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") release
@@ -10860,6 +11607,16 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") acq_rel
@@ -11000,6 +11757,16 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") seq_cst
@@ -11182,6 +11949,19 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") acquire
@@ -11365,6 +12145,19 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") acq_rel
@@ -11548,6 +12341,19 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") seq_cst
@@ -11715,6 +12521,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -11882,6 +12700,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12049,6 +12879,18 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12216,6 +13058,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12383,6 +13237,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12550,6 +13416,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12717,6 +13595,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12884,6 +13774,18 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13051,6 +13953,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13218,6 +14132,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13385,6 +14311,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13552,6 +14490,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13719,6 +14669,18 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13886,6 +14848,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14053,6 +15027,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14262,6 +15248,21 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14473,6 +15474,21 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14684,6 +15700,21 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14895,6 +15926,21 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15106,6 +16152,21 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15317,6 +16378,21 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15528,6 +16604,21 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15739,6 +16830,21 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15950,6 +17056,21 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16161,6 +17282,21 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16372,6 +17508,21 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16583,6 +17734,21 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16794,6 +17960,21 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -17005,6 +18186,21 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -17216,6 +18412,21 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index 5e5e3bf..75e28f9 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -8,6 +8,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @local_volatile_load_0(
; GFX6-LABEL: local_volatile_load_0:
@@ -141,6 +142,18 @@ define amdgpu_kernel void @local_volatile_load_0(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_volatile_load_0:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ds_load_b32 v1, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(1) %out) {
entry:
%val = load volatile i32, ptr addrspace(3) %in, align 4
@@ -308,6 +321,22 @@ define amdgpu_kernel void @local_volatile_load_1(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_volatile_load_1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_mov_b32 s2, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v1, v1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 2
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_lshl_add_u32 v1, v1, s2, s3
+; GFX1250-NEXT: ds_load_b32 v1, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(1) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -429,6 +458,18 @@ define amdgpu_kernel void @local_volatile_store_0(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_volatile_store_0:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(3) %out) {
entry:
%val = load i32, ptr addrspace(1) %in, align 4
@@ -570,6 +611,22 @@ define amdgpu_kernel void @local_volatile_store_1(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_volatile_store_1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s1, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v0, v0, s1
+; GFX1250-NEXT: s_mov_b32 s1, 2
+; GFX1250-NEXT: v_lshl_add_u32 v0, v0, s1, s2
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(3) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -698,6 +755,18 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_volatile_workgroup_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic volatile i32, ptr addrspace(3) %in syncscope("workgroup") acquire, align 4
@@ -813,6 +882,17 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_volatile_workgroup_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic volatile i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
index b24622a..7e345ed 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
@@ -12,6 +12,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @local_wavefront_unordered_load(
; GFX6-LABEL: local_wavefront_unordered_load:
@@ -177,6 +178,18 @@ define amdgpu_kernel void @local_wavefront_unordered_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront") unordered, align 4
@@ -348,6 +361,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront") monotonic, align 4
@@ -519,6 +544,18 @@ define amdgpu_kernel void @local_wavefront_acquire_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront") acquire, align 4
@@ -690,6 +727,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront") seq_cst, align 4
@@ -831,6 +880,16 @@ define amdgpu_kernel void @local_wavefront_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront") unordered, align 4
@@ -971,6 +1030,16 @@ define amdgpu_kernel void @local_wavefront_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront") monotonic, align 4
@@ -1111,6 +1180,16 @@ define amdgpu_kernel void @local_wavefront_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront") release, align 4
@@ -1251,6 +1330,16 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront") seq_cst, align 4
@@ -1391,6 +1480,16 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") monotonic
@@ -1531,6 +1630,16 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") acquire
@@ -1671,6 +1780,16 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") release
@@ -1811,6 +1930,16 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") acq_rel
@@ -1951,6 +2080,16 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") seq_cst
@@ -2133,6 +2272,19 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") acquire
@@ -2316,6 +2468,19 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") acq_rel
@@ -2499,6 +2664,19 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") seq_cst
@@ -2666,6 +2844,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -2833,6 +3023,18 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3000,6 +3202,18 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3167,6 +3381,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3334,6 +3560,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3501,6 +3739,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3668,6 +3918,18 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3835,6 +4097,18 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4002,6 +4276,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4169,6 +4455,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4336,6 +4634,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4503,6 +4813,18 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4670,6 +4992,18 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4837,6 +5171,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5004,6 +5350,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5213,6 +5571,21 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5424,6 +5797,21 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5635,6 +6023,21 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5846,6 +6249,21 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6057,6 +6475,21 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6268,6 +6701,21 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6479,6 +6927,21 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6690,6 +7153,21 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6901,6 +7379,21 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7112,6 +7605,21 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7323,6 +7831,21 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7534,6 +8057,21 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7745,6 +8283,21 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7956,6 +8509,21 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8167,6 +8735,21 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8340,6 +8923,18 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront-one-as") unordered, align 4
@@ -8511,6 +9106,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront-one-as") monotonic, align 4
@@ -8682,6 +9289,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront-one-as") acquire, align 4
@@ -8853,6 +9472,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront-one-as") seq_cst, align 4
@@ -8994,6 +9625,16 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront-one-as") unordered, align 4
@@ -9134,6 +9775,16 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront-one-as") monotonic, align 4
@@ -9274,6 +9925,16 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront-one-as") release, align 4
@@ -9414,6 +10075,16 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront-one-as") seq_cst, align 4
@@ -9554,6 +10225,16 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") monotonic
@@ -9694,6 +10375,16 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -9834,6 +10525,16 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") release
@@ -9974,6 +10675,16 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -10114,6 +10825,16 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -10296,6 +11017,19 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -10479,6 +11213,19 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -10662,6 +11409,19 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -10829,6 +11589,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -10996,6 +11768,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -11163,6 +11947,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -11330,6 +12126,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -11497,6 +12305,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -11664,6 +12484,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -11831,6 +12663,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -11998,6 +12842,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12165,6 +13021,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12332,6 +13200,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12499,6 +13379,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12666,6 +13558,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12833,6 +13737,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13000,6 +13916,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13167,6 +14095,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13376,6 +14316,21 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13587,6 +14542,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13798,6 +14768,21 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14009,6 +14994,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14220,6 +15220,21 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14431,6 +15446,21 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14642,6 +15672,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14853,6 +15898,21 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15064,6 +16124,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15275,6 +16350,21 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15486,6 +16576,21 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15697,6 +16802,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15908,6 +17028,21 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16119,6 +17254,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16330,6 +17480,21 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index 62d7f48..6aaf9d3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -12,6 +12,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
define amdgpu_kernel void @local_workgroup_unordered_load(
; GFX6-LABEL: local_workgroup_unordered_load:
@@ -177,6 +178,18 @@ define amdgpu_kernel void @local_workgroup_unordered_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") unordered, align 4
@@ -348,6 +361,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") monotonic, align 4
@@ -524,6 +549,18 @@ define amdgpu_kernel void @local_workgroup_acquire_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") acquire, align 4
@@ -718,6 +755,19 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") seq_cst, align 4
@@ -859,6 +909,16 @@ define amdgpu_kernel void @local_workgroup_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") unordered, align 4
@@ -999,6 +1059,16 @@ define amdgpu_kernel void @local_workgroup_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") monotonic, align 4
@@ -1157,6 +1227,17 @@ define amdgpu_kernel void @local_workgroup_release_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4
@@ -1315,6 +1396,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") seq_cst, align 4
@@ -1455,6 +1547,16 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") monotonic
@@ -1611,6 +1713,17 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acquire
@@ -1769,6 +1882,17 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") release
@@ -1943,6 +2067,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2117,6 +2253,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst
@@ -2304,6 +2452,19 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acquire
@@ -2510,6 +2671,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2716,6 +2891,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst
@@ -2883,6 +3072,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3066,6 +3267,19 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3251,6 +3465,19 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3452,6 +3679,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3653,6 +3894,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3836,6 +4091,19 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4019,6 +4287,19 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4220,6 +4501,20 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4421,6 +4716,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4622,6 +4931,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4823,6 +5146,20 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5024,6 +5361,20 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5225,6 +5576,20 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5426,6 +5791,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5627,6 +6006,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5836,6 +6229,21 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6052,6 +6460,21 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6281,6 +6704,22 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6515,6 +6954,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6749,6 +7204,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6965,6 +7436,21 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7181,6 +7667,21 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7415,6 +7916,22 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7649,6 +8166,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7883,6 +8416,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8117,6 +8666,22 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8351,6 +8916,22 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8585,6 +9166,22 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8819,6 +9416,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9053,6 +9666,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9226,6 +9855,18 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") unordered, align 4
@@ -9397,6 +10038,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") monotonic, align 4
@@ -9568,6 +10221,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") acquire, align 4
@@ -9739,6 +10404,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -9880,6 +10557,16 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") unordered, align 4
@@ -10020,6 +10707,16 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") monotonic, align 4
@@ -10160,6 +10857,16 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") release, align 4
@@ -10300,6 +11007,16 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -10440,6 +11157,16 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -10580,6 +11307,16 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -10720,6 +11457,16 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") release
@@ -10860,6 +11607,16 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -11000,6 +11757,16 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -11182,6 +11949,19 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -11365,6 +12145,19 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -11548,6 +12341,19 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -11715,6 +12521,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -11882,6 +12700,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12049,6 +12879,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12216,6 +13058,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12383,6 +13237,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12550,6 +13416,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12717,6 +13595,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -12884,6 +13774,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13051,6 +13953,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13218,6 +14132,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13385,6 +14311,18 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13552,6 +14490,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13719,6 +14669,18 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -13886,6 +14848,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14053,6 +15027,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14262,6 +15248,21 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14473,6 +15474,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14684,6 +15700,21 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -14895,6 +15926,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15106,6 +16152,21 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15317,6 +16378,21 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15528,6 +16604,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15739,6 +16830,21 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -15950,6 +17056,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16161,6 +17282,21 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16372,6 +17508,21 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16583,6 +17734,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -16794,6 +17960,21 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -17005,6 +18186,21 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -17216,6 +18412,21 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir
index 3751ae1..d9a6eb6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir
@@ -90,8 +90,8 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
- { id: 0, type: default, offset: 0, size: 4, alignment: 4, stack-id: default,
isImmutable: false, isAliased: false, callee-saved-register: '' }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir
index 4eec06b..8e6d948 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir
@@ -70,8 +70,8 @@ frameInfo:
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack:
- { id: 0, type: default, offset: 0, size: 4, alignment: 4, stack-id: default,
isImmutable: false, isAliased: false, callee-saved-register: '' }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
new file mode 100644
index 0000000..4ca0cc9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
@@ -0,0 +1,23434 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX942-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
+
+define amdgpu_kernel void @private_agent_unordered_load(
+; GFX6-LABEL: private_agent_unordered_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_unordered_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_unordered_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_unordered_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_unordered_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_unordered_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_unordered_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_unordered_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_unordered_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_unordered_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_unordered_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("agent") unordered, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_monotonic_load(
+; GFX6-LABEL: private_agent_monotonic_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_monotonic_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_monotonic_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_monotonic_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_monotonic_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_monotonic_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_monotonic_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_monotonic_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_monotonic_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_monotonic_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_monotonic_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("agent") monotonic, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_acquire_load(
+; GFX6-LABEL: private_agent_acquire_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_acquire_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_acquire_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_acquire_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_acquire_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_acquire_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_acquire_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_acquire_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_acquire_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_acquire_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_acquire_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("agent") acquire, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_seq_cst_load(
+; GFX6-LABEL: private_agent_seq_cst_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_seq_cst_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_seq_cst_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_seq_cst_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_seq_cst_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_seq_cst_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_seq_cst_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_seq_cst_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_seq_cst_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_seq_cst_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_seq_cst_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("agent") seq_cst, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_unordered_store(
+; GFX6-LABEL: private_agent_unordered_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_unordered_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_unordered_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_unordered_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_unordered_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_unordered_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_unordered_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_unordered_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_unordered_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_unordered_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_unordered_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("agent") unordered, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_monotonic_store(
+; GFX6-LABEL: private_agent_monotonic_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_monotonic_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_monotonic_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_monotonic_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_monotonic_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_monotonic_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_monotonic_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_monotonic_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_monotonic_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_monotonic_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_monotonic_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("agent") monotonic, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_release_store(
+; GFX6-LABEL: private_agent_release_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_release_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_release_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_release_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_release_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_release_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_release_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_release_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_release_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_release_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_release_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("agent") release, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_seq_cst_store(
+; GFX6-LABEL: private_agent_seq_cst_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_seq_cst_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_seq_cst_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_seq_cst_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_seq_cst_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_seq_cst_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_seq_cst_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_seq_cst_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_seq_cst_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_seq_cst_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_seq_cst_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_monotonic_atomicrmw(
+; GFX6-LABEL: private_agent_monotonic_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_monotonic_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_monotonic_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_monotonic_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_monotonic_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_monotonic_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_monotonic_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_monotonic_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_monotonic_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_monotonic_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("agent") monotonic
+ ret void
+}
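+; Note: in the checks above, the non-returning xchg on a private (scratch)
+; pointer is legalized to a plain buffer/scratch store on every listed target
+; except GFX1250, which instead converts the scratch address to a flat address
+; and issues flat_atomic_swap_b32 with scope:SCOPE_DEV.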
+
+define amdgpu_kernel void @private_agent_acquire_atomicrmw(
+; GFX6-LABEL: private_agent_acquire_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_acquire_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_acquire_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_acquire_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_acquire_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_acquire_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_acquire_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_acquire_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_acquire_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_acquire_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("agent") acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_release_atomicrmw(
+; GFX6-LABEL: private_agent_release_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_release_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_release_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_release_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_release_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_release_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_release_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_release_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_release_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_release_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_release_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("agent") release
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_acq_rel_atomicrmw(
+; GFX6-LABEL: private_agent_acq_rel_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_acq_rel_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_acq_rel_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_acq_rel_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_acq_rel_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_acq_rel_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_acq_rel_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_acq_rel_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_acq_rel_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("agent") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_seq_cst_atomicrmw(
+; GFX6-LABEL: private_agent_seq_cst_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_seq_cst_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_seq_cst_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_seq_cst_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_seq_cst_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_seq_cst_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_seq_cst_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_seq_cst_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_seq_cst_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("agent") seq_cst
+ ret void
+}
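+; Note: comparing the GFX1250 checks across the orderings above, release adds a
+; block of s_wait_* counter waits before the flat_atomic_swap_b32, acquire adds
+; s_wait_storecnt_dscnt plus global_inv after it, and acq_rel/seq_cst combine
+; both; the check lines for the remaining targets are unchanged across these
+; orderings.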
+
+define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw(
+; GFX6-LABEL: private_agent_acquire_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_acquire_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_acquire_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_acquire_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_acquire_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_acquire_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_acquire_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_acquire_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_acquire_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("agent") acquire
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
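+; Note: the _ret_ variants also store the old value back to %out, which in the
+; checks above appears as a scratch/buffer load followed by two stores on most
+; targets; GFX1250 instead uses flat_atomic_swap_b32 with th:TH_ATOMIC_RETURN,
+; followed by global_inv and a scratch_store_b32 with scope:SCOPE_SE.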
+
+define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw(
+; GFX6-LABEL: private_agent_acq_rel_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_acq_rel_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_acq_rel_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_acq_rel_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_acq_rel_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_acq_rel_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_acq_rel_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_acq_rel_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_acq_rel_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("agent") acq_rel
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw(
+; GFX6-LABEL: private_agent_seq_cst_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_seq_cst_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_seq_cst_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_seq_cst_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_seq_cst_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_seq_cst_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_seq_cst_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_seq_cst_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_seq_cst_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("agent") seq_cst
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg(
+; GFX6-LABEL: private_agent_monotonic_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_monotonic_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_monotonic_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_monotonic_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_monotonic_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_monotonic_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_monotonic_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_monotonic_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_monotonic_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg(
+; GFX6-LABEL: private_agent_acquire_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_acquire_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_acquire_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_acquire_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_acquire_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_acquire_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_acquire_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_acquire_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_acquire_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg(
+; GFX6-LABEL: private_agent_release_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_release_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_release_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_release_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_release_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_release_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_release_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_release_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_release_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") release monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg(
+; GFX6-LABEL: private_agent_acq_rel_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_acq_rel_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_acq_rel_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_acq_rel_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_acq_rel_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_acq_rel_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_acq_rel_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_acq_rel_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_acq_rel_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg(
+; GFX6-LABEL: private_agent_seq_cst_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_seq_cst_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_seq_cst_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_seq_cst_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_seq_cst_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_seq_cst_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_seq_cst_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_seq_cst_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_seq_cst_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg(
+; GFX6-LABEL: private_agent_monotonic_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_monotonic_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_monotonic_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_monotonic_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_monotonic_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_monotonic_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_monotonic_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_monotonic_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_monotonic_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_monotonic_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_monotonic_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") monotonic acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg(
+; GFX6-LABEL: private_agent_acquire_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_acquire_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_acquire_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_acquire_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_acquire_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_acquire_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_acquire_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_acquire_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_acquire_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") acquire acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_release_acquire_cmpxchg(
+; GFX6-LABEL: private_agent_release_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_release_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_release_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_release_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_release_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_release_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_release_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_release_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_release_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") release acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg(
+; GFX6-LABEL: private_agent_acq_rel_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_acq_rel_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_acq_rel_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_acq_rel_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_acq_rel_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_acq_rel_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_acq_rel_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_acq_rel_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_acq_rel_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg(
+; GFX6-LABEL: private_agent_seq_cst_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_seq_cst_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_seq_cst_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_seq_cst_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_seq_cst_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_seq_cst_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_seq_cst_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_seq_cst_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_seq_cst_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg(
+; GFX6-LABEL: private_agent_monotonic_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_monotonic_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_monotonic_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_monotonic_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_monotonic_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_monotonic_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_monotonic_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_monotonic_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_monotonic_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_monotonic_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_monotonic_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") monotonic seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg(
+; GFX6-LABEL: private_agent_acquire_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_acquire_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_acquire_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_acquire_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_acquire_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_acquire_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_acquire_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_acquire_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_acquire_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_acquire_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_acquire_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") acquire seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg(
+; GFX6-LABEL: private_agent_release_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_release_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_release_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_release_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_release_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_release_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_release_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_release_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_release_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_release_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_release_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") release seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg(
+; GFX6-LABEL: private_agent_acq_rel_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_acq_rel_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_acq_rel_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_acq_rel_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_acq_rel_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_acq_rel_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_acq_rel_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_acq_rel_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_acq_rel_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_acq_rel_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_acq_rel_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") acq_rel seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg(
+; GFX6-LABEL: private_agent_seq_cst_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_seq_cst_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_seq_cst_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_seq_cst_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_seq_cst_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_seq_cst_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_seq_cst_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_seq_cst_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_seq_cst_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_agent_acquire_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_acquire_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_acquire_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_acquire_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_acquire_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_acquire_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_acquire_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_acquire_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_acquire_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_agent_release_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_release_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_release_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_release_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_release_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_release_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_release_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_release_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_release_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_release_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_release_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") release monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_agent_monotonic_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_monotonic_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_monotonic_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_monotonic_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_monotonic_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_monotonic_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_monotonic_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_monotonic_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_monotonic_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_monotonic_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_monotonic_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") monotonic acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_agent_acquire_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_acquire_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_acquire_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_acquire_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_acquire_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_acquire_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_acquire_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_acquire_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_acquire_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") acquire acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_agent_release_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_release_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_release_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_release_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_release_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_release_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_release_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_release_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_release_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") release acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") monotonic seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") acquire seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_agent_release_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_release_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_release_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_release_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_release_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_release_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_release_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_release_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_release_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_release_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_release_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") release seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") acq_rel seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_unordered_load(
+; GFX6-LABEL: private_agent_one_as_unordered_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_unordered_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_unordered_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_unordered_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_unordered_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_unordered_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_unordered_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_unordered_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_unordered_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_unordered_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("agent-one-as") unordered, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_monotonic_load(
+; GFX6-LABEL: private_agent_one_as_monotonic_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_monotonic_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_monotonic_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_monotonic_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_monotonic_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_monotonic_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_monotonic_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_monotonic_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_monotonic_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("agent-one-as") monotonic, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_acquire_load(
+; GFX6-LABEL: private_agent_one_as_acquire_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_acquire_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_acquire_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_acquire_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_acquire_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_acquire_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_acquire_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_acquire_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_acquire_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("agent-one-as") acquire, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_seq_cst_load(
+; GFX6-LABEL: private_agent_one_as_seq_cst_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_seq_cst_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_seq_cst_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_seq_cst_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_seq_cst_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_seq_cst_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_seq_cst_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_seq_cst_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_seq_cst_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("agent-one-as") seq_cst, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_unordered_store(
+; GFX6-LABEL: private_agent_one_as_unordered_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_unordered_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_unordered_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_unordered_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_unordered_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_unordered_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_unordered_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_unordered_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_unordered_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_unordered_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("agent-one-as") unordered, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_monotonic_store(
+; GFX6-LABEL: private_agent_one_as_monotonic_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_monotonic_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_monotonic_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_monotonic_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_monotonic_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_monotonic_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_monotonic_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_monotonic_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_monotonic_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("agent-one-as") monotonic, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_release_store(
+; GFX6-LABEL: private_agent_one_as_release_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_release_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_release_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_release_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_release_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_release_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_release_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_release_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_release_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_release_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_release_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("agent-one-as") release, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_seq_cst_store(
+; GFX6-LABEL: private_agent_one_as_seq_cst_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_seq_cst_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_seq_cst_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_seq_cst_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_seq_cst_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_seq_cst_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_seq_cst_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_seq_cst_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_seq_cst_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("agent-one-as") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw(
+; GFX6-LABEL: private_agent_one_as_monotonic_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_monotonic_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_monotonic_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_monotonic_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_monotonic_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_monotonic_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_monotonic_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_monotonic_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("agent-one-as") monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw(
+; GFX6-LABEL: private_agent_one_as_acquire_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_acquire_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_acquire_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_acquire_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_acquire_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_acquire_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_acquire_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_acquire_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("agent-one-as") acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_release_atomicrmw(
+; GFX6-LABEL: private_agent_one_as_release_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_release_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_release_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_release_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_release_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_release_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_release_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_release_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_release_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("agent-one-as") release
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw(
+; GFX6-LABEL: private_agent_one_as_acq_rel_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_acq_rel_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_acq_rel_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_acq_rel_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_acq_rel_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_acq_rel_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_acq_rel_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_acq_rel_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("agent-one-as") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw(
+; GFX6-LABEL: private_agent_one_as_seq_cst_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_seq_cst_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_seq_cst_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_seq_cst_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_seq_cst_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_seq_cst_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_seq_cst_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_seq_cst_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("agent-one-as") seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw(
+; GFX6-LABEL: private_agent_one_as_acquire_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_acquire_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_acquire_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_acquire_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_acquire_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_acquire_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_acquire_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_acquire_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("agent-one-as") acquire
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw(
+; GFX6-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("agent-one-as") acq_rel
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw(
+; GFX6-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("agent-one-as") seq_cst
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_release_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_release_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_release_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_release_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_release_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_release_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_release_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_release_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_release_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_acquire_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_acquire_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_acquire_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_acquire_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_acquire_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_acquire_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_acquire_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_acquire_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_release_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_release_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_release_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_release_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_release_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_release_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_release_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_release_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_release_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_release_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_release_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_release_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_release_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_release_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_release_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_release_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_release_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_release_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_release_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_release_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") release seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") release seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll
new file mode 100644
index 0000000..e9ee6b4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll
@@ -0,0 +1,23329 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck --check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck --check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck --check-prefixes=GFX942-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
+
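+; The tests below check the code generated for atomic operations on the
+; private (scratch) address space with the "singlethread" syncscope, for each
+; of the targets covered by the RUN lines above.
+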
+define amdgpu_kernel void @private_singlethread_unordered_load(
+; GFX6-LABEL: private_singlethread_unordered_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_unordered_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_unordered_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_unordered_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_unordered_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_unordered_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_unordered_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_unordered_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_unordered_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_unordered_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_unordered_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("singlethread") unordered, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_monotonic_load(
+; GFX6-LABEL: private_singlethread_monotonic_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_monotonic_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_monotonic_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_monotonic_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_monotonic_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_monotonic_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_monotonic_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_monotonic_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_monotonic_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_monotonic_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_monotonic_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("singlethread") monotonic, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_acquire_load(
+; GFX6-LABEL: private_singlethread_acquire_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_acquire_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_acquire_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_acquire_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_acquire_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_acquire_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_acquire_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acquire_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_acquire_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_acquire_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_acquire_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("singlethread") acquire, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_seq_cst_load(
+; GFX6-LABEL: private_singlethread_seq_cst_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_seq_cst_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_seq_cst_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_seq_cst_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_seq_cst_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_seq_cst_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_seq_cst_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_seq_cst_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_seq_cst_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("singlethread") seq_cst, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_unordered_store(
+; GFX6-LABEL: private_singlethread_unordered_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_unordered_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_unordered_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_unordered_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_unordered_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_unordered_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_unordered_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_unordered_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_unordered_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_unordered_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_unordered_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("singlethread") unordered, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_monotonic_store(
+; GFX6-LABEL: private_singlethread_monotonic_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_monotonic_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_monotonic_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_monotonic_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_monotonic_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_monotonic_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_monotonic_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_monotonic_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_monotonic_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_monotonic_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_monotonic_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("singlethread") monotonic, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_release_store(
+; GFX6-LABEL: private_singlethread_release_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_release_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_release_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_release_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_release_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_release_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_release_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_release_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_release_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_release_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_release_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("singlethread") release, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_seq_cst_store(
+; GFX6-LABEL: private_singlethread_seq_cst_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_seq_cst_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_seq_cst_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_seq_cst_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_seq_cst_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_seq_cst_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_seq_cst_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_seq_cst_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_seq_cst_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("singlethread") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw(
+; GFX6-LABEL: private_singlethread_monotonic_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_monotonic_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_monotonic_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_monotonic_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_monotonic_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_monotonic_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_monotonic_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_monotonic_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_monotonic_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_monotonic_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("singlethread") monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_acquire_atomicrmw(
+; GFX6-LABEL: private_singlethread_acquire_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_acquire_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_acquire_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_acquire_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_acquire_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_acquire_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acquire_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_acquire_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_acquire_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_acquire_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("singlethread") acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_release_atomicrmw(
+; GFX6-LABEL: private_singlethread_release_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_release_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_release_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_release_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_release_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_release_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_release_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_release_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_release_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_release_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_release_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("singlethread") release
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw(
+; GFX6-LABEL: private_singlethread_acq_rel_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_acq_rel_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_acq_rel_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_acq_rel_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_acq_rel_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_acq_rel_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_acq_rel_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_acq_rel_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("singlethread") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw(
+; GFX6-LABEL: private_singlethread_seq_cst_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_seq_cst_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_seq_cst_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_seq_cst_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_seq_cst_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_seq_cst_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_seq_cst_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_seq_cst_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("singlethread") seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw(
+; GFX6-LABEL: private_singlethread_acquire_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_acquire_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_acquire_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_acquire_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_acquire_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acquire_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_acquire_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_acquire_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_acquire_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("singlethread") acquire
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw(
+; GFX6-LABEL: private_singlethread_acq_rel_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_acq_rel_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_acq_rel_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_acq_rel_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_acq_rel_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_acq_rel_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_acq_rel_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_acq_rel_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("singlethread") acq_rel
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw(
+; GFX6-LABEL: private_singlethread_seq_cst_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_seq_cst_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_seq_cst_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_seq_cst_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_seq_cst_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_seq_cst_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_seq_cst_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_seq_cst_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("singlethread") seq_cst
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg(
+; GFX6-LABEL: private_singlethread_monotonic_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_monotonic_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_monotonic_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_monotonic_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_monotonic_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_monotonic_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_monotonic_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_monotonic_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_monotonic_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg(
+; GFX6-LABEL: private_singlethread_acquire_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_acquire_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_acquire_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_acquire_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_acquire_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acquire_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_acquire_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_acquire_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_acquire_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg(
+; GFX6-LABEL: private_singlethread_release_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_release_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_release_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_release_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_release_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_release_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_release_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_release_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_release_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg(
+; GFX6-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg(
+; GFX6-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg(
+; GFX6-LABEL: private_singlethread_monotonic_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_monotonic_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_monotonic_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_monotonic_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_monotonic_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_monotonic_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_monotonic_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_monotonic_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_monotonic_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_monotonic_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_monotonic_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg(
+; GFX6-LABEL: private_singlethread_acquire_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_acquire_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_acquire_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_acquire_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_acquire_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acquire_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_acquire_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_acquire_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_acquire_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg(
+; GFX6-LABEL: private_singlethread_release_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_release_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_release_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_release_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_release_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_release_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_release_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_release_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_release_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg(
+; GFX6-LABEL: private_singlethread_acq_rel_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_acq_rel_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_acq_rel_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_acq_rel_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_acq_rel_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_acq_rel_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_acq_rel_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_acq_rel_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg(
+; GFX6-LABEL: private_singlethread_seq_cst_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_seq_cst_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_seq_cst_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_seq_cst_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_seq_cst_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_seq_cst_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_seq_cst_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_seq_cst_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg(
+; GFX6-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg(
+; GFX6-LABEL: private_singlethread_acquire_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_acquire_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_acquire_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_acquire_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_acquire_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_acquire_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_acquire_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acquire_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_acquire_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_acquire_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_acquire_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg(
+; GFX6-LABEL: private_singlethread_release_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_release_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_release_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_release_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_release_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_release_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_release_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_release_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_release_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_release_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_release_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg(
+; GFX6-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg(
+; GFX6-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_release_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_release_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_release_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_release_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_release_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_release_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_release_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_release_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_release_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_release_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_release_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_release_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_release_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_release_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_release_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_release_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_release_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_release_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_release_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_release_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_unordered_load(
+; GFX6-LABEL: private_singlethread_one_as_unordered_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_unordered_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_unordered_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_unordered_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_unordered_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_unordered_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_unordered_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_unordered_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_unordered_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_unordered_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("singlethread-one-as") unordered, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_monotonic_load(
+; GFX6-LABEL: private_singlethread_one_as_monotonic_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_monotonic_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_monotonic_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_monotonic_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_monotonic_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_monotonic_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_monotonic_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("singlethread-one-as") monotonic, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_acquire_load(
+; GFX6-LABEL: private_singlethread_one_as_acquire_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_acquire_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_acquire_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_acquire_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_acquire_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_acquire_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acquire_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_acquire_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_acquire_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("singlethread-one-as") acquire, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_seq_cst_load(
+; GFX6-LABEL: private_singlethread_one_as_seq_cst_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_seq_cst_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_seq_cst_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_seq_cst_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_seq_cst_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_seq_cst_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_seq_cst_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("singlethread-one-as") seq_cst, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_unordered_store(
+; GFX6-LABEL: private_singlethread_one_as_unordered_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_unordered_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_unordered_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_unordered_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_unordered_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_unordered_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_unordered_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_unordered_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_unordered_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_unordered_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("singlethread-one-as") unordered, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_monotonic_store(
+; GFX6-LABEL: private_singlethread_one_as_monotonic_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_monotonic_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_monotonic_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_monotonic_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_monotonic_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_monotonic_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_monotonic_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("singlethread-one-as") monotonic, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_release_store(
+; GFX6-LABEL: private_singlethread_one_as_release_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_release_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_release_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_release_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_release_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_release_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_release_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_release_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_release_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("singlethread-one-as") release, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_seq_cst_store(
+; GFX6-LABEL: private_singlethread_one_as_seq_cst_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_seq_cst_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_seq_cst_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_seq_cst_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_seq_cst_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_seq_cst_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_seq_cst_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("singlethread-one-as") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw(
+; GFX6-LABEL: private_singlethread_one_as_monotonic_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_monotonic_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_monotonic_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_monotonic_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_monotonic_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_monotonic_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_monotonic_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("singlethread-one-as") monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw(
+; GFX6-LABEL: private_singlethread_one_as_acquire_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_acquire_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_acquire_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_acquire_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_acquire_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acquire_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_acquire_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_acquire_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("singlethread-one-as") acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw(
+; GFX6-LABEL: private_singlethread_one_as_release_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_release_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_release_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_release_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_release_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_release_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_release_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_release_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("singlethread-one-as") release
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw(
+; GFX6-LABEL: private_singlethread_one_as_acq_rel_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_acq_rel_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_acq_rel_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_acq_rel_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_acq_rel_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_acq_rel_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_acq_rel_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("singlethread-one-as") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw(
+; GFX6-LABEL: private_singlethread_one_as_seq_cst_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_seq_cst_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_seq_cst_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_seq_cst_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_seq_cst_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_seq_cst_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_seq_cst_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("singlethread-one-as") seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw(
+; GFX6-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("singlethread-one-as") acquire
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw(
+; GFX6-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("singlethread-one-as") acq_rel
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw(
+; GFX6-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("singlethread-one-as") seq_cst
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_release_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_release_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_release_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_release_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_release_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_release_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_release_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_release_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll
new file mode 100644
index 0000000..24ec3a3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll
@@ -0,0 +1,22387 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX942-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
+
+define amdgpu_kernel void @private_system_unordered_load(
+; GFX6-LABEL: private_system_unordered_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_unordered_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_unordered_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_unordered_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_unordered_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_unordered_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_unordered_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_unordered_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_unordered_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_unordered_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_unordered_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in unordered, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
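+; Monotonic atomic load at system scope from a private (addrspace(5)) pointer;
+; the loaded value is written back through a second private pointer.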
+define amdgpu_kernel void @private_system_monotonic_load(
+; GFX6-LABEL: private_system_monotonic_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_monotonic_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_monotonic_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_monotonic_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_monotonic_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_monotonic_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_monotonic_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_monotonic_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_monotonic_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_monotonic_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_monotonic_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in monotonic, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
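+; Acquire atomic load at system scope from a private (addrspace(5)) pointer;
+; the loaded value is written back through a second private pointer.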
+define amdgpu_kernel void @private_system_acquire_load(
+; GFX6-LABEL: private_system_acquire_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_acquire_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_acquire_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_acquire_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_acquire_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_acquire_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_acquire_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_acquire_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_acquire_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_acquire_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_acquire_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in acquire, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
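+; Seq_cst atomic load at system scope from a private (addrspace(5)) pointer;
+; the loaded value is written back through a second private pointer.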
+define amdgpu_kernel void @private_system_seq_cst_load(
+; GFX6-LABEL: private_system_seq_cst_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_seq_cst_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_seq_cst_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_seq_cst_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_seq_cst_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_seq_cst_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_seq_cst_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_seq_cst_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_seq_cst_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_seq_cst_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_seq_cst_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in seq_cst, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
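+; Unordered atomic store at system scope of an i32 kernel argument through a
+; private (addrspace(5)) pointer.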
+define amdgpu_kernel void @private_system_unordered_store(
+; GFX6-LABEL: private_system_unordered_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_unordered_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_unordered_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_unordered_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_unordered_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_unordered_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_unordered_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_unordered_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_unordered_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_unordered_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_unordered_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out unordered, align 4
+ ret void
+}
+
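+; Monotonic atomic store at system scope of an i32 kernel argument through a
+; private (addrspace(5)) pointer.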
+define amdgpu_kernel void @private_system_monotonic_store(
+; GFX6-LABEL: private_system_monotonic_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_monotonic_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_monotonic_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_monotonic_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_monotonic_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_monotonic_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_monotonic_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_monotonic_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_monotonic_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_monotonic_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_monotonic_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out monotonic, align 4
+ ret void
+}
+
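+; Release atomic store at system scope of an i32 kernel argument through a
+; private (addrspace(5)) pointer.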
+define amdgpu_kernel void @private_system_release_store(
+; GFX6-LABEL: private_system_release_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_release_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_release_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_release_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_release_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_release_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_release_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_release_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_release_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_release_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_release_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out release, align 4
+ ret void
+}
+
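+; Seq_cst atomic store at system scope of an i32 kernel argument through a
+; private (addrspace(5)) pointer.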
+define amdgpu_kernel void @private_system_seq_cst_store(
+; GFX6-LABEL: private_system_seq_cst_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_seq_cst_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_seq_cst_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_seq_cst_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_seq_cst_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_seq_cst_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_seq_cst_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_seq_cst_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_seq_cst_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_seq_cst_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_seq_cst_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_monotonic_atomicrmw(
+; GFX6-LABEL: private_system_monotonic_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_monotonic_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_monotonic_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_monotonic_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_monotonic_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_monotonic_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_monotonic_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_monotonic_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_monotonic_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_monotonic_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_system_acquire_atomicrmw(
+; GFX6-LABEL: private_system_acquire_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_acquire_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_acquire_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_acquire_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_acquire_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_acquire_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_acquire_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_acquire_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_acquire_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_acquire_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_system_release_atomicrmw(
+; GFX6-LABEL: private_system_release_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_release_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_release_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_release_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_release_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_release_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_release_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_release_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_release_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_release_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_release_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in release
+ ret void
+}
+
+define amdgpu_kernel void @private_system_acq_rel_atomicrmw(
+; GFX6-LABEL: private_system_acq_rel_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_acq_rel_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_acq_rel_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_acq_rel_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_acq_rel_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_acq_rel_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_acq_rel_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_acq_rel_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_acq_rel_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @private_system_seq_cst_atomicrmw(
+; GFX6-LABEL: private_system_seq_cst_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_seq_cst_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_seq_cst_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_seq_cst_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_seq_cst_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_seq_cst_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_seq_cst_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_seq_cst_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_seq_cst_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_system_acquire_ret_atomicrmw(
+; GFX6-LABEL: private_system_acquire_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_acquire_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_acquire_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_acquire_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_acquire_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_acquire_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_acquire_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_acquire_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_acquire_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in acquire
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw(
+; GFX6-LABEL: private_system_acq_rel_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_acq_rel_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_acq_rel_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_acq_rel_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_acq_rel_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_acq_rel_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_acq_rel_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_acq_rel_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_acq_rel_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in acq_rel
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw(
+; GFX6-LABEL: private_system_seq_cst_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_seq_cst_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_seq_cst_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_seq_cst_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_seq_cst_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_seq_cst_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_seq_cst_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_seq_cst_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_seq_cst_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in seq_cst
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg(
+; GFX6-LABEL: private_system_monotonic_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_monotonic_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_monotonic_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_monotonic_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_monotonic_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_monotonic_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_monotonic_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_monotonic_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_monotonic_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in monotonic monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg(
+; GFX6-LABEL: private_system_acquire_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_acquire_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_acquire_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_acquire_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_acquire_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_acquire_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_acquire_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_acquire_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_acquire_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in acquire monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_system_release_monotonic_cmpxchg(
+; GFX6-LABEL: private_system_release_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_release_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_release_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_release_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_release_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_release_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_release_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_release_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_release_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in release monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg(
+; GFX6-LABEL: private_system_acq_rel_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_acq_rel_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_acq_rel_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_acq_rel_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_acq_rel_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_acq_rel_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_acq_rel_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_acq_rel_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_acq_rel_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in acq_rel monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg(
+; GFX6-LABEL: private_system_seq_cst_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_seq_cst_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_seq_cst_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_seq_cst_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_seq_cst_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_seq_cst_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_seq_cst_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_seq_cst_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_seq_cst_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in seq_cst monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg(
+; GFX6-LABEL: private_system_monotonic_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_monotonic_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_monotonic_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_monotonic_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_monotonic_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_monotonic_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_monotonic_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_monotonic_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_monotonic_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_monotonic_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_monotonic_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in monotonic acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg(
+; GFX6-LABEL: private_system_acquire_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_acquire_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_acquire_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_acquire_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_acquire_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_acquire_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_acquire_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_acquire_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_acquire_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in acquire acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_system_release_acquire_cmpxchg(
+; GFX6-LABEL: private_system_release_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_release_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_release_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_release_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_release_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_release_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_release_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_release_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_release_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in release acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg(
+; GFX6-LABEL: private_system_acq_rel_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_acq_rel_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_acq_rel_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_acq_rel_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_acq_rel_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_acq_rel_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_acq_rel_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_acq_rel_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_acq_rel_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in acq_rel acquire
+ ret void
+}
+
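+; As above, but with seq_cst/acquire success/failure ordering.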
+define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg(
+; GFX6-LABEL: private_system_seq_cst_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_seq_cst_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_seq_cst_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_seq_cst_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_seq_cst_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_seq_cst_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_seq_cst_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_seq_cst_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_seq_cst_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in seq_cst acquire
+ ret void
+}
+
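+; As above, but with seq_cst/seq_cst success/failure ordering.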
+define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg(
+; GFX6-LABEL: private_system_seq_cst_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_seq_cst_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_seq_cst_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_seq_cst_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_seq_cst_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_seq_cst_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_seq_cst_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_seq_cst_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_seq_cst_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in seq_cst seq_cst
+ ret void
+}
+
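+; Returning variant: the extracted cmpxchg result is stored back to %out, so in
+; addition to the compare-and-select store at the gep, the originally loaded
+; value is stored to %out; GFX1250 instead issues flat_atomic_cmpswap_b32 with
+; th:TH_ATOMIC_RETURN and stores the returned value.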
+define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_system_monotonic_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_monotonic_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_monotonic_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_monotonic_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_monotonic_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_monotonic_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_monotonic_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_monotonic_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_monotonic_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in monotonic monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
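+; Returning variant with acquire/monotonic ordering; on GFX1250 a global_inv
+; follows the returning atomic before the result store.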
+define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_system_acquire_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_acquire_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_acquire_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_acquire_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_acquire_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_acquire_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_acquire_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_acquire_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_acquire_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in acquire monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in acq_rel monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in seq_cst monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_system_monotonic_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_monotonic_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_monotonic_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_monotonic_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_monotonic_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_monotonic_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_monotonic_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_monotonic_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_monotonic_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_monotonic_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_monotonic_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in monotonic acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_system_acquire_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_acquire_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_acquire_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_acquire_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_acquire_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_acquire_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_acquire_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_acquire_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_acquire_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in acquire acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_system_release_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_release_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_release_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_release_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_release_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_release_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_release_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_release_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_release_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in release acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_system_acq_rel_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_acq_rel_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_acq_rel_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_acq_rel_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_acq_rel_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_acq_rel_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_acq_rel_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_acq_rel_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_acq_rel_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in acq_rel acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_system_seq_cst_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_seq_cst_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_seq_cst_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_seq_cst_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_seq_cst_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_seq_cst_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_seq_cst_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_seq_cst_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_seq_cst_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in seq_cst acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in monotonic seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_system_acquire_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_acquire_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_acquire_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_acquire_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_acquire_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_acquire_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_acquire_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_acquire_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_acquire_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in acquire seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_release_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_system_release_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_release_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_release_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_release_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_release_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_release_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_release_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_release_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_release_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_release_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_release_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in release seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in acq_rel seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in seq_cst seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_unordered_load(
+; GFX6-LABEL: private_system_one_as_unordered_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_unordered_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_unordered_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_unordered_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_unordered_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_unordered_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_unordered_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_unordered_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_unordered_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_unordered_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("one-as") unordered, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_monotonic_load(
+; GFX6-LABEL: private_system_one_as_monotonic_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_monotonic_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_monotonic_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_monotonic_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_monotonic_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_monotonic_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_monotonic_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_monotonic_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_monotonic_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("one-as") monotonic, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_acquire_load(
+; GFX6-LABEL: private_system_one_as_acquire_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_acquire_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_acquire_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_acquire_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_acquire_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_acquire_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acquire_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_acquire_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_acquire_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_acquire_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("one-as") acquire, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_seq_cst_load(
+; GFX6-LABEL: private_system_one_as_seq_cst_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_seq_cst_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_seq_cst_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_seq_cst_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_seq_cst_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_seq_cst_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_seq_cst_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_seq_cst_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_seq_cst_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("one-as") seq_cst, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_unordered_store(
+; GFX6-LABEL: private_system_one_as_unordered_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_unordered_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_unordered_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_unordered_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_unordered_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_unordered_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_unordered_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_unordered_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_unordered_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_unordered_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("one-as") unordered, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_monotonic_store(
+; GFX6-LABEL: private_system_one_as_monotonic_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_monotonic_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_monotonic_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_monotonic_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_monotonic_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_monotonic_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_monotonic_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_monotonic_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_monotonic_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("one-as") monotonic, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_release_store(
+; GFX6-LABEL: private_system_one_as_release_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_release_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_release_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_release_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_release_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_release_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_release_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_release_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_release_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_release_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_release_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("one-as") release, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_seq_cst_store(
+; GFX6-LABEL: private_system_one_as_seq_cst_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_seq_cst_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_seq_cst_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_seq_cst_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_seq_cst_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_seq_cst_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_seq_cst_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_seq_cst_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_seq_cst_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("one-as") seq_cst, align 4
+ ret void
+}
+
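+; Monotonic "one-as" xchg on a private pointer with an unused result: the checks expect a plain store of the new value on most targets, while GFX1250 converts the scratch address to a flat address and uses flat_atomic_swap_b32 at system scope.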
+define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw(
+; GFX6-LABEL: private_system_one_as_monotonic_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_monotonic_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_monotonic_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_monotonic_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_monotonic_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_monotonic_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_monotonic_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_monotonic_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("one-as") monotonic
+ ret void
+}
+
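+; Acquire "one-as" xchg on a private pointer: same lowering as the monotonic case, except GFX1250 additionally waits on the atomic and invalidates caches (global_inv) afterwards.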
+define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw(
+; GFX6-LABEL: private_system_one_as_acquire_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_acquire_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_acquire_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_acquire_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_acquire_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acquire_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_acquire_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_acquire_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_acquire_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("one-as") acquire
+ ret void
+}
+
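+; Release "one-as" xchg on a private pointer: the GFX1250 checks expect a cache write-back (global_wb) and counter waits before the flat atomic swap.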
+define amdgpu_kernel void @private_system_one_as_release_atomicrmw(
+; GFX6-LABEL: private_system_one_as_release_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_release_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_release_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_release_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_release_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_release_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_release_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_release_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_release_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("one-as") release
+ ret void
+}
+
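+; Acq_rel "one-as" xchg on a private pointer: the GFX1250 checks combine the release-side global_wb before the swap with the acquire-side global_inv after it.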
+define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw(
+; GFX6-LABEL: private_system_one_as_acq_rel_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_acq_rel_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_acq_rel_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_acq_rel_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_acq_rel_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_acq_rel_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_acq_rel_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_acq_rel_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("one-as") acq_rel
+ ret void
+}
+
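+; Seq_cst "one-as" xchg on a private pointer: expected to match the acq_rel lowering on all targets checked here.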
+define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw(
+; GFX6-LABEL: private_system_one_as_seq_cst_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_seq_cst_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_seq_cst_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_seq_cst_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_seq_cst_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_seq_cst_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_seq_cst_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_seq_cst_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("one-as") seq_cst
+ ret void
+}
+
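+; Acquire "one-as" xchg on a private pointer whose old value is used: pre-GFX1250 targets expand it to a scratch load/store pair, while GFX1250 uses flat_atomic_swap_b32 with TH_ATOMIC_RETURN, invalidates caches, and stores the returned value.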
+define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw(
+; GFX6-LABEL: private_system_one_as_acquire_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_acquire_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_acquire_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_acquire_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_acquire_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acquire_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_acquire_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_acquire_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_acquire_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("one-as") acquire
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
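+; Acq_rel "one-as" xchg with the old value used: the pre-GFX1250 checks expect the same load/store expansion as the acquire ret variant.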
+define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw(
+; GFX6-LABEL: private_system_one_as_acq_rel_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_acq_rel_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_acq_rel_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_acq_rel_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_acq_rel_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_acq_rel_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_acq_rel_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_acq_rel_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("one-as") acq_rel
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw(
+; GFX6-LABEL: private_system_one_as_seq_cst_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_seq_cst_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_seq_cst_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_seq_cst_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_seq_cst_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_seq_cst_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_seq_cst_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_seq_cst_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("one-as") seq_cst
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg(
+; GFX6-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg(
+; GFX6-LABEL: private_system_one_as_acquire_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_acquire_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_acquire_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_acquire_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_acquire_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acquire_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_acquire_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_acquire_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_acquire_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg(
+; GFX6-LABEL: private_system_one_as_release_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_release_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_release_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_release_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_release_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_release_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_release_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_release_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_release_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg(
+; GFX6-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg(
+; GFX6-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg(
+; GFX6-LABEL: private_system_one_as_monotonic_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_monotonic_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_monotonic_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_monotonic_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_monotonic_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_monotonic_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_monotonic_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_monotonic_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_monotonic_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg(
+; GFX6-LABEL: private_system_one_as_acquire_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_acquire_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_acquire_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_acquire_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_acquire_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acquire_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_acquire_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_acquire_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_acquire_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg(
+; GFX6-LABEL: private_system_one_as_release_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_release_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_release_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_release_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_release_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_release_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_release_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_release_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_release_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") release acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg(
+; GFX6-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg(
+; GFX6-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg(
+; GFX6-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg(
+; GFX6-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg(
+; GFX6-LABEL: private_system_one_as_release_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_release_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_release_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_release_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_release_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_release_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_release_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_release_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_release_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_release_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_release_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg(
+; GFX6-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg(
+; GFX6-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_system_one_as_release_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_release_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_release_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_release_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_release_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_release_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_release_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_release_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_release_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") release acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_bvhcnt 0x0
+; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll
new file mode 100644
index 0000000..8b22544
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll
@@ -0,0 +1,23329 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX942-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
+
+define amdgpu_kernel void @private_wavefront_unordered_load(
+; GFX6-LABEL: private_wavefront_unordered_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_unordered_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_unordered_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_unordered_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_unordered_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_unordered_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_unordered_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_unordered_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_unordered_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_unordered_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_unordered_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("wavefront") unordered, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_monotonic_load(
+; GFX6-LABEL: private_wavefront_monotonic_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_monotonic_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_monotonic_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_monotonic_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_monotonic_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_monotonic_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_monotonic_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_monotonic_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_monotonic_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_monotonic_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_monotonic_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("wavefront") monotonic, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_acquire_load(
+; GFX6-LABEL: private_wavefront_acquire_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_acquire_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_acquire_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_acquire_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_acquire_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_acquire_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_acquire_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acquire_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_acquire_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_acquire_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_acquire_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("wavefront") acquire, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_seq_cst_load(
+; GFX6-LABEL: private_wavefront_seq_cst_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_seq_cst_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_seq_cst_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_seq_cst_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_seq_cst_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_seq_cst_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_seq_cst_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_seq_cst_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_seq_cst_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("wavefront") seq_cst, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_unordered_store(
+; GFX6-LABEL: private_wavefront_unordered_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_unordered_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_unordered_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_unordered_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_unordered_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_unordered_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_unordered_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_unordered_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_unordered_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_unordered_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_unordered_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("wavefront") unordered, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_monotonic_store(
+; GFX6-LABEL: private_wavefront_monotonic_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_monotonic_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_monotonic_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_monotonic_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_monotonic_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_monotonic_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_monotonic_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_monotonic_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_monotonic_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_monotonic_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_monotonic_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("wavefront") monotonic, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_release_store(
+; GFX6-LABEL: private_wavefront_release_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_release_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_release_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_release_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_release_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_release_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_release_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_release_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_release_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_release_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_release_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("wavefront") release, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_seq_cst_store(
+; GFX6-LABEL: private_wavefront_seq_cst_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_seq_cst_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_seq_cst_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_seq_cst_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_seq_cst_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_seq_cst_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_seq_cst_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_seq_cst_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_seq_cst_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("wavefront") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw(
+; GFX6-LABEL: private_wavefront_monotonic_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_monotonic_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_monotonic_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_monotonic_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_monotonic_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_monotonic_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_monotonic_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_monotonic_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_monotonic_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_monotonic_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("wavefront") monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_acquire_atomicrmw(
+; GFX6-LABEL: private_wavefront_acquire_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_acquire_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_acquire_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_acquire_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_acquire_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_acquire_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acquire_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_acquire_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_acquire_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_acquire_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("wavefront") acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_release_atomicrmw(
+; GFX6-LABEL: private_wavefront_release_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_release_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_release_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_release_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_release_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_release_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_release_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_release_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_release_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_release_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_release_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("wavefront") release
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw(
+; GFX6-LABEL: private_wavefront_acq_rel_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_acq_rel_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_acq_rel_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_acq_rel_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_acq_rel_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_acq_rel_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_acq_rel_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_acq_rel_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("wavefront") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw(
+; GFX6-LABEL: private_wavefront_seq_cst_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_seq_cst_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_seq_cst_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_seq_cst_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_seq_cst_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_seq_cst_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_seq_cst_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_seq_cst_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("wavefront") seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw(
+; GFX6-LABEL: private_wavefront_acquire_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_acquire_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_acquire_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_acquire_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_acquire_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acquire_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_acquire_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_acquire_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_acquire_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("wavefront") acquire
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw(
+; GFX6-LABEL: private_wavefront_acq_rel_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_acq_rel_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_acq_rel_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_acq_rel_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_acq_rel_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_acq_rel_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_acq_rel_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_acq_rel_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("wavefront") acq_rel
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw(
+; GFX6-LABEL: private_wavefront_seq_cst_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_seq_cst_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_seq_cst_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_seq_cst_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_seq_cst_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_seq_cst_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_seq_cst_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_seq_cst_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("wavefront") seq_cst
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg(
+; GFX6-LABEL: private_wavefront_monotonic_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_monotonic_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_monotonic_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_monotonic_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_monotonic_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_monotonic_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_monotonic_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_monotonic_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_monotonic_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg(
+; GFX6-LABEL: private_wavefront_acquire_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_acquire_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_acquire_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_acquire_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_acquire_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acquire_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_acquire_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_acquire_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_acquire_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg(
+; GFX6-LABEL: private_wavefront_release_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_release_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_release_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_release_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_release_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_release_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_release_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_release_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_release_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg(
+; GFX6-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg(
+; GFX6-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg(
+; GFX6-LABEL: private_wavefront_monotonic_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_monotonic_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_monotonic_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_monotonic_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_monotonic_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_monotonic_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_monotonic_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_monotonic_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_monotonic_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_monotonic_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_monotonic_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg(
+; GFX6-LABEL: private_wavefront_acquire_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_acquire_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_acquire_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_acquire_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_acquire_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acquire_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_acquire_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_acquire_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_acquire_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg(
+; GFX6-LABEL: private_wavefront_release_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_release_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_release_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_release_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_release_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_release_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_release_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_release_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_release_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") release acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg(
+; GFX6-LABEL: private_wavefront_acq_rel_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_acq_rel_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_acq_rel_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_acq_rel_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_acq_rel_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_acq_rel_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_acq_rel_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_acq_rel_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg(
+; GFX6-LABEL: private_wavefront_seq_cst_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_seq_cst_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_seq_cst_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_seq_cst_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_seq_cst_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_seq_cst_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_seq_cst_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_seq_cst_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg(
+; GFX6-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg(
+; GFX6-LABEL: private_wavefront_acquire_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_acquire_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_acquire_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_acquire_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_acquire_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_acquire_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_acquire_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acquire_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_acquire_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_acquire_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_acquire_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg(
+; GFX6-LABEL: private_wavefront_release_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_release_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_release_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_release_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_release_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_release_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_release_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_release_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_release_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_release_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_release_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg(
+; GFX6-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg(
+; GFX6-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_release_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_release_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_release_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_release_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_release_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_release_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_release_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_release_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_release_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_release_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_release_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_release_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_release_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_release_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_release_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_release_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_release_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_release_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_release_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_release_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") release acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_unordered_load(
+; GFX6-LABEL: private_wavefront_one_as_unordered_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_unordered_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_unordered_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_unordered_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_unordered_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_unordered_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_unordered_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_unordered_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_unordered_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_unordered_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("wavefront-one-as") unordered, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_monotonic_load(
+; GFX6-LABEL: private_wavefront_one_as_monotonic_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_monotonic_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_monotonic_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_monotonic_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_monotonic_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_monotonic_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_monotonic_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("wavefront-one-as") monotonic, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_acquire_load(
+; GFX6-LABEL: private_wavefront_one_as_acquire_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_acquire_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_acquire_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_acquire_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_acquire_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_acquire_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acquire_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_acquire_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_acquire_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("wavefront-one-as") acquire, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_seq_cst_load(
+; GFX6-LABEL: private_wavefront_one_as_seq_cst_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_seq_cst_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_seq_cst_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_seq_cst_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_seq_cst_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_seq_cst_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_seq_cst_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("wavefront-one-as") seq_cst, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_unordered_store(
+; GFX6-LABEL: private_wavefront_one_as_unordered_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_unordered_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_unordered_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_unordered_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_unordered_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_unordered_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_unordered_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_unordered_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_unordered_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_unordered_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("wavefront-one-as") unordered, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_monotonic_store(
+; GFX6-LABEL: private_wavefront_one_as_monotonic_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_monotonic_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_monotonic_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_monotonic_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_monotonic_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_monotonic_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_monotonic_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("wavefront-one-as") monotonic, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_release_store(
+; GFX6-LABEL: private_wavefront_one_as_release_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_release_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_release_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_release_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_release_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_release_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_release_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_release_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_release_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("wavefront-one-as") release, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_seq_cst_store(
+; GFX6-LABEL: private_wavefront_one_as_seq_cst_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_seq_cst_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_seq_cst_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_seq_cst_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_seq_cst_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_seq_cst_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_seq_cst_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("wavefront-one-as") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw(
+; GFX6-LABEL: private_wavefront_one_as_monotonic_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_monotonic_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_monotonic_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_monotonic_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_monotonic_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_monotonic_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_monotonic_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("wavefront-one-as") monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw(
+; GFX6-LABEL: private_wavefront_one_as_acquire_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_acquire_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_acquire_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_acquire_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_acquire_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acquire_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_acquire_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_acquire_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("wavefront-one-as") acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw(
+; GFX6-LABEL: private_wavefront_one_as_release_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_release_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_release_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_release_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_release_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_release_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_release_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_release_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("wavefront-one-as") release
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw(
+; GFX6-LABEL: private_wavefront_one_as_acq_rel_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_acq_rel_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_acq_rel_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_acq_rel_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_acq_rel_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_acq_rel_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_acq_rel_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("wavefront-one-as") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw(
+; GFX6-LABEL: private_wavefront_one_as_seq_cst_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_seq_cst_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_seq_cst_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_seq_cst_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_seq_cst_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_seq_cst_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_seq_cst_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("wavefront-one-as") seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw(
+; GFX6-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("wavefront-one-as") acquire
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw(
+; GFX6-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("wavefront-one-as") acq_rel
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw(
+; GFX6-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("wavefront-one-as") seq_cst
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_release_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_release_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_release_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_release_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_release_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_release_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_release_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_release_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll
new file mode 100644
index 0000000..127434c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll
@@ -0,0 +1,23375 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX942-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s
+
+define amdgpu_kernel void @private_workgroup_unordered_load(
+; GFX6-LABEL: private_workgroup_unordered_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_unordered_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_unordered_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_unordered_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_unordered_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_unordered_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_unordered_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_unordered_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_unordered_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_unordered_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_unordered_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("workgroup") unordered, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_monotonic_load(
+; GFX6-LABEL: private_workgroup_monotonic_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_monotonic_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_monotonic_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_monotonic_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_monotonic_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_monotonic_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_monotonic_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_monotonic_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_monotonic_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_monotonic_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_monotonic_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("workgroup") monotonic, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_acquire_load(
+; GFX6-LABEL: private_workgroup_acquire_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_acquire_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_acquire_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_acquire_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_acquire_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_acquire_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_acquire_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acquire_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_acquire_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_acquire_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_acquire_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("workgroup") acquire, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_seq_cst_load(
+; GFX6-LABEL: private_workgroup_seq_cst_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_seq_cst_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_seq_cst_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_seq_cst_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_seq_cst_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_seq_cst_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_seq_cst_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_seq_cst_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_seq_cst_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("workgroup") seq_cst, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_unordered_store(
+; GFX6-LABEL: private_workgroup_unordered_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_unordered_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_unordered_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_unordered_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_unordered_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_unordered_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_unordered_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_unordered_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_unordered_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_unordered_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_unordered_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("workgroup") unordered, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_monotonic_store(
+; GFX6-LABEL: private_workgroup_monotonic_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_monotonic_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_monotonic_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_monotonic_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_monotonic_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_monotonic_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_monotonic_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_monotonic_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_monotonic_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_monotonic_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_monotonic_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("workgroup") monotonic, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_release_store(
+; GFX6-LABEL: private_workgroup_release_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_release_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_release_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_release_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_release_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_release_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_release_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_release_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_release_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_release_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_release_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("workgroup") release, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_seq_cst_store(
+; GFX6-LABEL: private_workgroup_seq_cst_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_seq_cst_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_seq_cst_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_seq_cst_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_seq_cst_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_seq_cst_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_seq_cst_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_seq_cst_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_seq_cst_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("workgroup") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw(
+; GFX6-LABEL: private_workgroup_monotonic_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_monotonic_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_monotonic_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_monotonic_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_monotonic_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_monotonic_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_monotonic_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_monotonic_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_monotonic_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_monotonic_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("workgroup") monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_acquire_atomicrmw(
+; GFX6-LABEL: private_workgroup_acquire_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_acquire_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_acquire_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_acquire_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_acquire_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_acquire_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acquire_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_acquire_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_acquire_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_acquire_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("workgroup") acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_release_atomicrmw(
+; GFX6-LABEL: private_workgroup_release_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_release_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_release_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_release_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_release_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_release_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_release_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_release_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_release_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_release_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_release_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("workgroup") release
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw(
+; GFX6-LABEL: private_workgroup_acq_rel_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_acq_rel_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_acq_rel_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_acq_rel_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_acq_rel_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_acq_rel_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_acq_rel_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_acq_rel_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("workgroup") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw(
+; GFX6-LABEL: private_workgroup_seq_cst_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_seq_cst_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_seq_cst_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_seq_cst_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_seq_cst_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_seq_cst_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_seq_cst_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_seq_cst_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("workgroup") seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw(
+; GFX6-LABEL: private_workgroup_acquire_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_acquire_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_acquire_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_acquire_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_acquire_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acquire_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_acquire_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_acquire_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_acquire_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("workgroup") acquire
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw(
+; GFX6-LABEL: private_workgroup_acq_rel_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_acq_rel_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_acq_rel_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_acq_rel_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_acq_rel_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_acq_rel_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_acq_rel_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_acq_rel_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("workgroup") acq_rel
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw(
+; GFX6-LABEL: private_workgroup_seq_cst_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_seq_cst_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_seq_cst_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_seq_cst_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_seq_cst_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_seq_cst_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_seq_cst_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_seq_cst_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("workgroup") seq_cst
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg(
+; GFX6-LABEL: private_workgroup_monotonic_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_monotonic_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_monotonic_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_monotonic_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_monotonic_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_monotonic_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_monotonic_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_monotonic_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_monotonic_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg(
+; GFX6-LABEL: private_workgroup_acquire_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_acquire_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_acquire_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_acquire_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_acquire_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acquire_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_acquire_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_acquire_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_acquire_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg(
+; GFX6-LABEL: private_workgroup_release_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_release_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_release_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_release_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_release_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_release_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_release_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_release_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_release_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg(
+; GFX6-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg(
+; GFX6-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg(
+; GFX6-LABEL: private_workgroup_monotonic_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_monotonic_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_monotonic_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_monotonic_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_monotonic_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_monotonic_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_monotonic_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_monotonic_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_monotonic_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_monotonic_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_monotonic_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg(
+; GFX6-LABEL: private_workgroup_acquire_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_acquire_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_acquire_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_acquire_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_acquire_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acquire_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_acquire_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_acquire_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_acquire_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg(
+; GFX6-LABEL: private_workgroup_release_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_release_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_release_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_release_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_release_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_release_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_release_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_release_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_release_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") release acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg(
+; GFX6-LABEL: private_workgroup_acq_rel_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_acq_rel_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_acq_rel_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_acq_rel_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_acq_rel_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_acq_rel_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_acq_rel_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_acq_rel_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg(
+; GFX6-LABEL: private_workgroup_seq_cst_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_seq_cst_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_seq_cst_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_seq_cst_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_seq_cst_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_seq_cst_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_seq_cst_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_seq_cst_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg(
+; GFX6-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") monotonic seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg(
+; GFX6-LABEL: private_workgroup_acquire_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_acquire_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_acquire_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_acquire_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_acquire_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_acquire_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_acquire_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acquire_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_acquire_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_acquire_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_acquire_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") acquire seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg(
+; GFX6-LABEL: private_workgroup_release_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_release_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_release_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_release_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_release_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_release_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_release_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_release_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_release_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_release_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_release_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") release seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg(
+; GFX6-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg(
+; GFX6-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_release_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_release_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_release_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_release_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_release_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_release_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_release_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_release_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_release_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_release_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_release_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_release_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_release_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_release_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_release_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_release_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_release_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_release_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_release_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_release_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") release acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") monotonic seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") acquire seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") release seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_unordered_load(
+; GFX6-LABEL: private_workgroup_one_as_unordered_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_unordered_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_unordered_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_unordered_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_unordered_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_unordered_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_unordered_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_unordered_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_unordered_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_unordered_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_unordered_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_unordered_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("workgroup-one-as") unordered, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_monotonic_load(
+; GFX6-LABEL: private_workgroup_one_as_monotonic_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_monotonic_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_monotonic_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_monotonic_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_monotonic_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_monotonic_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_monotonic_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_monotonic_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("workgroup-one-as") monotonic, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_acquire_load(
+; GFX6-LABEL: private_workgroup_one_as_acquire_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_acquire_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_acquire_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_acquire_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_acquire_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_acquire_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acquire_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_acquire_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_acquire_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_acquire_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("workgroup-one-as") acquire, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_seq_cst_load(
+; GFX6-LABEL: private_workgroup_one_as_seq_cst_load:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_seq_cst_load:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_seq_cst_load:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_seq_cst_load:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_seq_cst_load:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_load:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_load:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_load:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_load:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_seq_cst_load:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_seq_cst_load:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %in, ptr addrspace(5) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(5) %in syncscope("workgroup-one-as") seq_cst, align 4
+ store i32 %val, ptr addrspace(5) %out
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_unordered_store(
+; GFX6-LABEL: private_workgroup_one_as_unordered_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_unordered_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_unordered_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_unordered_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_unordered_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_unordered_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_unordered_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_unordered_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_unordered_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_unordered_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_unordered_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("workgroup-one-as") unordered, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_monotonic_store(
+; GFX6-LABEL: private_workgroup_one_as_monotonic_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_monotonic_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_monotonic_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_monotonic_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_monotonic_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_monotonic_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_monotonic_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("workgroup-one-as") monotonic, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_release_store(
+; GFX6-LABEL: private_workgroup_one_as_release_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_release_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_release_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_release_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_release_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_release_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_release_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_release_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_release_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("workgroup-one-as") release, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_seq_cst_store(
+; GFX6-LABEL: private_workgroup_one_as_seq_cst_store:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_seq_cst_store:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_seq_cst_store:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_seq_cst_store:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_seq_cst_store:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_store:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_store:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_store:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_store:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_seq_cst_store:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_seq_cst_store:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ i32 %in, ptr addrspace(5) %out) {
+entry:
+ store atomic i32 %in, ptr addrspace(5) %out syncscope("workgroup-one-as") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw(
+; GFX6-LABEL: private_workgroup_one_as_monotonic_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_monotonic_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_monotonic_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_monotonic_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_monotonic_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_monotonic_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_monotonic_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("workgroup-one-as") monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw(
+; GFX6-LABEL: private_workgroup_one_as_acquire_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_acquire_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_acquire_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_acquire_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_acquire_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_acquire_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acquire_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_acquire_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_acquire_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("workgroup-one-as") acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw(
+; GFX6-LABEL: private_workgroup_one_as_release_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_release_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_release_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_release_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_release_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_release_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_release_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_release_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_release_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("workgroup-one-as") release
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw(
+; GFX6-LABEL: private_workgroup_one_as_acq_rel_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_acq_rel_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_acq_rel_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_acq_rel_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_acq_rel_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_acq_rel_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_acq_rel_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("workgroup-one-as") acq_rel
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw(
+; GFX6-LABEL: private_workgroup_one_as_seq_cst_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_seq_cst_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_seq_cst_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_seq_cst_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_seq_cst_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_seq_cst_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_seq_cst_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s1, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s1
+; GFX1250-NEXT: s_mov_b32 s1, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s1, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s1, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
+; GFX1250-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s1, v2, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s1, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("workgroup-one-as") seq_cst
+ ret void
+}
+
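+; The *_ret_atomicrmw tests below differ only in ordering (acquire, acq_rel,
+; seq_cst). Private (scratch) memory is per-thread, so the volatile xchg is
+; expanded to a buffer/scratch load of the old value, a store of %in, and a
+; store of the loaded value back to %out, with no cache maintenance; GFX1250
+; instead converts the scratch address to a flat address and emits
+; flat_atomic_swap_b32 with TH_ATOMIC_RETURN.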
+define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw(
+; GFX6-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("workgroup-one-as") acquire
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw(
+; GFX6-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("workgroup-one-as") acq_rel
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw(
+; GFX6-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: s_waitcnt vmcnt(1)
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(1)
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(1)
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[2:3], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s3, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s0, s3
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in) {
+entry:
+ %val = atomicrmw volatile xchg ptr addrspace(5) %out, i32 %in syncscope("workgroup-one-as") seq_cst
+ store i32 %val, ptr addrspace(5) %out, align 4
+ ret void
+}
+
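+; The *_cmpxchg tests below operate on element 4 of the private pointer
+; (byte offset 16) and are expanded to a buffer/scratch load, compare, select,
+; and store, again with no cache maintenance; GFX1250 converts the scratch
+; address to a flat address and emits flat_atomic_cmpswap_b32.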
+define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_release_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_release_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_release_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_release_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_release_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_release_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_release_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_release_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_release_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX6-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX6-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX6-NEXT: s_mov_b32 s7, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s4, s4, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
+; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
+; GFX7-NEXT: s_mov_b32 s7, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s4, s4, s7
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v0, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s0, s0, s3
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x4
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s0, s0, s3
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s0
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s3, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s0, s0, s3
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s3, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s0, s0, s3
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v0, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 offset:16
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s3, s2, s3
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s2
+; GFX1250-NEXT: s_mov_b32 s2, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s2, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[4:5], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[4:5], 0
+; GFX1250-NEXT: s_mov_b32 s2, s5
+; GFX1250-NEXT: s_mov_b32 s6, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s3, s6
+; GFX1250-NEXT: s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s2, v2, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s2, s4
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s2, v0, s3
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: v_mov_b32_e32 v4, s0
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
+; GFX6-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_add_u32 s0, s0, s15
+; GFX6-NEXT: s_addc_u32 s1, s1, 0
+; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX6-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX6-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX6-NEXT: s_mov_b32 s5, 16
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_add_i32 s5, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s5
+; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v2, s5
+; GFX6-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX6-NEXT: s_endpgm
+;
+; GFX7-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_u32 s0, s0, s17
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
+; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX7-NEXT: s_load_dword s8, s[6:7], 0x1
+; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
+; GFX7-NEXT: s_mov_b32 s5, 16
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_add_i32 s5, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s5
+; GFX7-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v2, s5
+; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_mov_b32_e32 v1, s4
+; GFX7-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-WGP-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
+; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-WGP-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX10-WGP-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-WGP-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-WGP-NEXT: s_endpgm
+;
+; GFX10-CU-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
+; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-CU-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX10-CU-NEXT: v_cmp_eq_u32_e64 s6, v0, s6
+; GFX10-CU-NEXT: v_cndmask_b32_e64 v1, v0, s5, s6
+; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-CU-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
+; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-CU-NEXT: s_endpgm
+;
+; SKIP-CACHE-INV-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; SKIP-CACHE-INV: ; %bb.0: ; %entry
+; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13]
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
+; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
+; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
+; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x1
+; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 16
+; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
+; SKIP-CACHE-INV-NEXT: s_add_i32 s1, s0, s1
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
+; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
+; SKIP-CACHE-INV-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s4
+; SKIP-CACHE-INV-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s1
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
+; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
+; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX90A-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX90A-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
+; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x4
+; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[6:7]
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX942-NOTTGSPLIT-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NOTTGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-NOTTGSPLIT-NEXT: s_nop 0
+; GFX942-NOTTGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX942-TGSPLIT-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX942-TGSPLIT: ; %bb.0: ; %entry
+; GFX942-TGSPLIT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX942-TGSPLIT-NEXT: s_load_dword s4, s[2:3], 0x4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
+; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 16
+; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-TGSPLIT-NEXT: s_add_i32 s1, s0, s1
+; GFX942-TGSPLIT-NEXT: scratch_load_dword v0, off, s1
+; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX942-TGSPLIT-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s2
+; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX942-TGSPLIT-NEXT: s_nop 0
+; GFX942-TGSPLIT-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v1, s1
+; GFX942-TGSPLIT-NEXT: scratch_store_dword off, v0, s0
+; GFX942-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-WGP-NEXT: s_mov_b32 s1, 16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_add_i32 s1, s0, s1
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-WGP-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-CU-NEXT: s_mov_b32 s1, 16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_add_i32 s1, s0, s1
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_cmp_eq_u32_e64 s3, v0, s3
+; GFX11-CU-NEXT: v_cndmask_b32_e64 v1, v0, s2, s3
+; GFX11-CU-NEXT: scratch_store_b32 off, v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_endpgm
+;
+; GFX12-WGP-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-WGP: ; %bb.0: ; %entry
+; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
+; GFX12-WGP-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
+; GFX12-WGP-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xf1ff
+; GFX12-WGP-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-WGP-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-WGP-NEXT: s_endpgm
+;
+; GFX12-CU-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX12-CU: ; %bb.0: ; %entry
+; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX12-CU-NEXT: s_wait_kmcnt 0x0
+; GFX12-CU-NEXT: scratch_load_b32 v0, off, s0 offset:16
+; GFX12-CU-NEXT: s_wait_loadcnt 0x0
+; GFX12-CU-NEXT: v_cmp_eq_u32_e64 s2, v0, s2
+; GFX12-CU-NEXT: s_wait_alu 0xf1ff
+; GFX12-CU-NEXT: v_cndmask_b32_e64 v1, v0, s1, s2
+; GFX12-CU-NEXT: scratch_store_b32 off, v1, s0 offset:16
+; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_mov_b32 s3, 16
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s4, s0, s3
+; GFX1250-NEXT: s_mov_b32 s3, 0
+; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, s3
+; GFX1250-NEXT: s_mov_b32 s3, 20
+; GFX1250-NEXT: v_lshlrev_b32_e64 v2, s3, v0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_mov_b64 s[6:7], src_flat_scratch_base_lo
+; GFX1250-NEXT: v_add_nc_u64_e64 v[0:1], v[0:1], s[6:7]
+; GFX1250-NEXT: v_mov_b32_e32 v2, v1
+; GFX1250-NEXT: s_mov_b64 s[6:7], 0
+; GFX1250-NEXT: s_mov_b32 s3, s7
+; GFX1250-NEXT: s_mov_b32 s5, -1
+; GFX1250-NEXT: s_cmp_lg_u32 s4, s5
+; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, s3, v2, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: s_mov_b32 s3, s6
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, s3, v0, s4
+; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s1
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(5) %out, i32 %in, i32 %old) {
+entry:
+ %gep = getelementptr i32, ptr addrspace(5) %out, i32 4
+ %val = cmpxchg volatile ptr addrspace(5) %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
+ %val0 = extractvalue { i32, i1 } %val, 0
+ store i32 %val0, ptr addrspace(5) %out, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/merge-flat-saddr-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-flat-saddr-load-store.mir
new file mode 100644
index 0000000..1c133c6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/merge-flat-saddr-load-store.mir
@@ -0,0 +1,338 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-load-store-opt -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name: merge_flat_load_dword_saddr_2
+body: |
+ bb.0.entry:
+
+ ; GCN-LABEL: name: merge_flat_load_dword_saddr_2
+ ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[FLAT_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64_align2 = FLAT_LOAD_DWORDX2_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[FLAT_LOAD_DWORDX2_SADDR]].sub0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_SADDR]].sub1
+ ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
+ %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = FLAT_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ %3:vgpr_32 = FLAT_LOAD_DWORD_SADDR %0, %1, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ S_NOP 0, implicit %2, implicit %3
+...
+
+---
+name: merge_flat_load_dword_saddr_3
+body: |
+ bb.0.entry:
+
+ ; GCN-LABEL: name: merge_flat_load_dword_saddr_3
+ ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[FLAT_LOAD_DWORDX3_SADDR:%[0-9]+]]:vreg_96_align2 = FLAT_LOAD_DWORDX3_SADDR [[DEF]], [[DEF1]], 0, 1, implicit $exec, implicit $flat_scr :: (load (s96) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[FLAT_LOAD_DWORDX3_SADDR]].sub0_sub1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX3_SADDR]].sub2
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub1
+ ; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY1]]
+ %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = FLAT_LOAD_DWORD_SADDR %0, %1, 0, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ %3:vgpr_32 = FLAT_LOAD_DWORD_SADDR %0, %1, 4, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ %4:vgpr_32 = FLAT_LOAD_DWORD_SADDR %0, %1, 8, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ S_NOP 0, implicit %2, implicit %3, implicit %4
+...
+
+---
+name: merge_flat_load_dword_saddr_4
+body: |
+ bb.0.entry:
+
+ ; GCN-LABEL: name: merge_flat_load_dword_saddr_4
+ ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[FLAT_LOAD_DWORDX4_SADDR:%[0-9]+]]:vreg_128_align2 = FLAT_LOAD_DWORDX4_SADDR [[DEF]], [[DEF1]], 0, 2, implicit $exec, implicit $flat_scr :: (load (s128) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY [[FLAT_LOAD_DWORDX4_SADDR]].sub0_sub1_sub2
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX4_SADDR]].sub3
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]].sub0_sub1
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub2
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1
+ ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]]
+ %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = FLAT_LOAD_DWORD_SADDR %0, %1, 0, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ %3:vgpr_32 = FLAT_LOAD_DWORD_SADDR %0, %1, 4, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ %4:vgpr_32 = FLAT_LOAD_DWORD_SADDR %0, %1, 8, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ %5:vgpr_32 = FLAT_LOAD_DWORD_SADDR %0, %1, 12, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ S_NOP 0, implicit %2, implicit %3, implicit %4, implicit %5
+...
+
+---
+name: merge_flat_load_dword_saddr_6
+body: |
+ bb.0.entry:
+
+ ; GCN-LABEL: name: merge_flat_load_dword_saddr_6
+ ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[FLAT_LOAD_DWORDX4_SADDR:%[0-9]+]]:vreg_128_align2 = FLAT_LOAD_DWORDX4_SADDR [[DEF]], [[DEF1]], 4, 3, implicit $exec, implicit $flat_scr :: (load (s128) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY [[FLAT_LOAD_DWORDX4_SADDR]].sub0_sub1_sub2
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX4_SADDR]].sub3
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]].sub0_sub1
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub2
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1
+ ; GCN-NEXT: [[FLAT_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64_align2 = FLAT_LOAD_DWORDX2_SADDR [[DEF]], [[DEF1]], 20, 3, implicit $exec, implicit $flat_scr :: (load (s64) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_LOAD_DWORDX2_SADDR]].sub0
+ ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_SADDR]].sub1
+ ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[COPY6]], implicit [[COPY7]]
+ %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = FLAT_LOAD_DWORD_SADDR %0, %1, 4, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ %3:vgpr_32 = FLAT_LOAD_DWORD_SADDR %0, %1, 8, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ %4:vgpr_32 = FLAT_LOAD_DWORD_SADDR %0, %1, 12, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ %5:vgpr_32 = FLAT_LOAD_DWORD_SADDR %0, %1, 16, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ %6:vgpr_32 = FLAT_LOAD_DWORD_SADDR %0, %1, 20, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ %7:vgpr_32 = FLAT_LOAD_DWORD_SADDR %0, %1, 24, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ S_NOP 0, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7
+...
+
+---
+name: merge_flat_load_dwordx2_saddr
+body: |
+ bb.0.entry:
+
+ ; GCN-LABEL: name: merge_flat_load_dwordx2_saddr
+ ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[FLAT_LOAD_DWORDX4_SADDR:%[0-9]+]]:vreg_128_align2 = FLAT_LOAD_DWORDX4_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[FLAT_LOAD_DWORDX4_SADDR]].sub0_sub1
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[FLAT_LOAD_DWORDX4_SADDR]].sub2_sub3
+ ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
+ %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vreg_64_align2 = FLAT_LOAD_DWORDX2_SADDR %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ %3:vreg_64_align2 = FLAT_LOAD_DWORDX2_SADDR %0, %1, 8, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ S_NOP 0, implicit %2, implicit %3
+...
+
+---
+name: no_merge_flat_load_dword_and_flat_load_dword_saddr
+body: |
+ bb.0.entry:
+
+ ; GCN-LABEL: name: no_merge_flat_load_dword_and_flat_load_dword_saddr
+ ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; GCN-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
+ ; GCN-NEXT: [[FLAT_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD_SADDR [[DEF]], [[DEF1]].sub0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
+ ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD_SADDR]]
+ %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+ %1:vreg_64_align2 = IMPLICIT_DEF
+ %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ %3:vgpr_32 = FLAT_LOAD_DWORD_SADDR %0, %1.sub0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ S_NOP 0, implicit %2, implicit %3
+...
+
+---
+name: no_merge_flat_load_dword_saddr_different_saddr
+body: |
+ bb.0.entry:
+
+ ; GCN-LABEL: name: no_merge_flat_load_dword_saddr_different_saddr
+ ; GCN: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[FLAT_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD_SADDR [[DEF]].sub0_sub1, [[DEF1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
+ ; GCN-NEXT: [[FLAT_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD_SADDR [[DEF]].sub2_sub3, [[DEF1]], 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
+ ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD_SADDR]], implicit [[FLAT_LOAD_DWORD_SADDR1]]
+ %0:sgpr_128 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = FLAT_LOAD_DWORD_SADDR %0.sub0_sub1, %1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ %3:vgpr_32 = FLAT_LOAD_DWORD_SADDR %0.sub2_sub3, %1, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ S_NOP 0, implicit %2, implicit %3
+...
+
+---
+name: no_merge_flat_load_dword_saddr_different_vaddr
+body: |
+ bb.0.entry:
+
+ ; GCN-LABEL: name: no_merge_flat_load_dword_saddr_different_vaddr
+ ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; GCN-NEXT: [[FLAT_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD_SADDR [[DEF]], [[DEF1]].sub0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
+ ; GCN-NEXT: [[FLAT_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD_SADDR [[DEF]], [[DEF1]].sub1, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
+ ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD_SADDR]], implicit [[FLAT_LOAD_DWORD_SADDR1]]
+ %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+ %1:vreg_64_align2 = IMPLICIT_DEF
+ %2:vgpr_32 = FLAT_LOAD_DWORD_SADDR %0, %1.sub0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ %3:vgpr_32 = FLAT_LOAD_DWORD_SADDR %0, %1.sub1, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+ S_NOP 0, implicit %2, implicit %3
+...
+---
+name: merge_flat_store_dword_saddr_2
+body: |
+ bb.0.entry:
+
+ ; GCN-LABEL: name: merge_flat_store_dword_saddr_2
+ ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[DEF2]], %subreg.sub0, [[DEF3]], %subreg.sub1
+ ; GCN-NEXT: FLAT_STORE_DWORDX2_SADDR [[DEF1]], killed [[REG_SEQUENCE]], [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+ %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = IMPLICIT_DEF
+ FLAT_STORE_DWORD_SADDR %1, %2, %0, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+ FLAT_STORE_DWORD_SADDR %1, %3, %0, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+...
+
+---
+name: merge_flat_store_dword_saddr_3
+body: |
+ bb.0.entry:
+
+ ; GCN-LABEL: name: merge_flat_store_dword_saddr_3
+ ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[DEF2]], %subreg.sub0, [[DEF3]], %subreg.sub1
+ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE killed [[REG_SEQUENCE]], %subreg.sub0_sub1, [[DEF4]], %subreg.sub2
+ ; GCN-NEXT: FLAT_STORE_DWORDX3_SADDR [[DEF1]], killed [[REG_SEQUENCE1]], [[DEF]], 4, 1, implicit $exec, implicit $flat_scr :: (store (s96) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+ %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = IMPLICIT_DEF
+ %4:vgpr_32 = IMPLICIT_DEF
+ FLAT_STORE_DWORD_SADDR %1, %2, %0, 4, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+ FLAT_STORE_DWORD_SADDR %1, %3, %0, 8, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+ FLAT_STORE_DWORD_SADDR %1, %4, %0, 12, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+...
+
+---
+name: merge_flat_store_dword_saddr_4
+body: |
+ bb.0.entry:
+
+ ; GCN-LABEL: name: merge_flat_store_dword_saddr_4
+ ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[DEF2]], %subreg.sub0, [[DEF3]], %subreg.sub1
+ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE killed [[REG_SEQUENCE]], %subreg.sub0_sub1, [[DEF4]], %subreg.sub2
+ ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[DEF5]], %subreg.sub3
+ ; GCN-NEXT: FLAT_STORE_DWORDX4_SADDR [[DEF1]], killed [[REG_SEQUENCE2]], [[DEF]], 4, 2, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+ %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = IMPLICIT_DEF
+ %4:vgpr_32 = IMPLICIT_DEF
+ %5:vgpr_32 = IMPLICIT_DEF
+ FLAT_STORE_DWORD_SADDR %1, %2, %0, 4, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+ FLAT_STORE_DWORD_SADDR %1, %3, %0, 8, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+ FLAT_STORE_DWORD_SADDR %1, %4, %0, 12, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+ FLAT_STORE_DWORD_SADDR %1, %5, %0, 16, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+...
+
+---
+name: merge_flat_store_dword_saddr_6
+body: |
+ bb.0.entry:
+
+ ; GCN-LABEL: name: merge_flat_store_dword_saddr_6
+ ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[DEF2]], %subreg.sub0, [[DEF3]], %subreg.sub1
+ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE killed [[REG_SEQUENCE]], %subreg.sub0_sub1, [[DEF4]], %subreg.sub2
+ ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[DEF5]], %subreg.sub3
+ ; GCN-NEXT: FLAT_STORE_DWORDX4_SADDR [[DEF1]], killed [[REG_SEQUENCE2]], [[DEF]], 4, 3, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+ ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[DEF6]], %subreg.sub0, [[DEF7]], %subreg.sub1
+ ; GCN-NEXT: FLAT_STORE_DWORDX2_SADDR [[DEF1]], killed [[REG_SEQUENCE3]], [[DEF]], 20, 3, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+ %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = IMPLICIT_DEF
+ %4:vgpr_32 = IMPLICIT_DEF
+ %5:vgpr_32 = IMPLICIT_DEF
+ %6:vgpr_32 = IMPLICIT_DEF
+ %7:vgpr_32 = IMPLICIT_DEF
+ FLAT_STORE_DWORD_SADDR %1, %2, %0, 4, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+ FLAT_STORE_DWORD_SADDR %1, %3, %0, 8, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+ FLAT_STORE_DWORD_SADDR %1, %4, %0, 12, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+ FLAT_STORE_DWORD_SADDR %1, %5, %0, 16, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+ FLAT_STORE_DWORD_SADDR %1, %6, %0, 20, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+ FLAT_STORE_DWORD_SADDR %1, %7, %0, 24, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+...
+
+---
+name: no_merge_flat_store_dword_saddr_with_flat_store_dword
+body: |
+ bb.0.entry:
+
+ ; GCN-LABEL: name: no_merge_flat_store_dword_saddr_with_flat_store_dword
+ ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: FLAT_STORE_DWORD_SADDR [[DEF1]].sub0, [[DEF2]], [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+ ; GCN-NEXT: FLAT_STORE_DWORD [[DEF1]], [[DEF3]], 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+ %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+ %1:vreg_64_align2 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = IMPLICIT_DEF
+ FLAT_STORE_DWORD_SADDR %1.sub0, %2, %0, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+ FLAT_STORE_DWORD %1, %3, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+...
+
+---
+name: no_merge_flat_store_dword_saddr_different_vaddr
+body: |
+ bb.0.entry:
+
+ ; GCN-LABEL: name: no_merge_flat_store_dword_saddr_different_vaddr
+ ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: FLAT_STORE_DWORD_SADDR [[DEF1]].sub0, [[DEF2]], [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+ ; GCN-NEXT: FLAT_STORE_DWORD_SADDR [[DEF1]].sub1, [[DEF3]], [[DEF]], 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+ %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+ %1:vreg_64_align2 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = IMPLICIT_DEF
+ FLAT_STORE_DWORD_SADDR %1.sub0, %2, %0, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+ FLAT_STORE_DWORD_SADDR %1.sub1, %3, %0, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+...
+
+---
+name: no_merge_flat_store_dword_saddr_different_saddr
+body: |
+ bb.0.entry:
+
+ ; GCN-LABEL: name: no_merge_flat_store_dword_saddr_different_saddr
+ ; GCN: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: FLAT_STORE_DWORD_SADDR [[DEF1]], [[DEF2]], [[DEF]].sub0_sub1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+ ; GCN-NEXT: FLAT_STORE_DWORD_SADDR [[DEF1]], [[DEF3]], [[DEF]].sub2_sub3, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+ %0:sgpr_128 = IMPLICIT_DEF
+ %1:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = IMPLICIT_DEF
+ FLAT_STORE_DWORD_SADDR %1, %2, %0.sub0_sub1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+ FLAT_STORE_DWORD_SADDR %1, %3, %0.sub2_sub3, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+...
diff --git a/llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir
index 4b4ec30..0e9c021 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir
@@ -1,93 +1,113 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-# GCN-LABEL: name: ds_read_b32_v_v
-# GCN: vreg_64_align2 = DS_READ2_B32
+---
name: ds_read_b32_v_v
body: |
bb.0:
+ ; GCN-LABEL: name: ds_read_b32_v_v
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64_align2 = DS_READ2_B32_gfx9 [[DEF]], 0, 2, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`, addrspace 3)
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DS_READ2_B32_gfx9_]].sub0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[DS_READ2_B32_gfx9_]].sub1
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = DS_READ_B32_gfx9 %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
%2:vgpr_32 = DS_READ_B32_gfx9 %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
...
-
-# GCN-LABEL: name: ds_read_b32_a_a
-# GCN: areg_64_align2 = DS_READ2_B32
+---
name: ds_read_b32_a_a
body: |
bb.0:
+ ; GCN-LABEL: name: ds_read_b32_a_a
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:areg_64_align2 = DS_READ2_B32_gfx9 [[DEF]], 0, 2, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`, addrspace 3)
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:agpr_32 = COPY [[DS_READ2_B32_gfx9_]].sub0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:agpr_32 = COPY killed [[DS_READ2_B32_gfx9_]].sub1
%0:vgpr_32 = IMPLICIT_DEF
%1:agpr_32 = DS_READ_B32_gfx9 %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
%2:agpr_32 = DS_READ_B32_gfx9 %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
...
-# GCN-LABEL: name: ds_read_b32_v_a
-# GCN: vgpr_32 = DS_READ_B32
-# GCN: agpr_32 = DS_READ_B32
+---
name: ds_read_b32_v_a
body: |
bb.0:
+ ; GCN-LABEL: name: ds_read_b32_v_a
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`, addrspace 3)
+ ; GCN-NEXT: [[DS_READ_B32_gfx9_1:%[0-9]+]]:agpr_32 = DS_READ_B32_gfx9 [[DEF]], 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`, addrspace 3)
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = DS_READ_B32_gfx9 %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
%2:agpr_32 = DS_READ_B32_gfx9 %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
...
-# GCN-LABEL: name: ds_read_b32_a_v
-# GCN: agpr_32 = DS_READ_B32
-# GCN: vgpr_32 = DS_READ_B32
+---
name: ds_read_b32_a_v
body: |
bb.0:
+ ; GCN-LABEL: name: ds_read_b32_a_v
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:agpr_32 = DS_READ_B32_gfx9 [[DEF]], 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`, addrspace 3)
+ ; GCN-NEXT: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`, addrspace 3)
%0:vgpr_32 = IMPLICIT_DEF
%1:agpr_32 = DS_READ_B32_gfx9 %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
%2:vgpr_32 = DS_READ_B32_gfx9 %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
...
-# GCN-LABEL: name: ds_write_b32_v_v
-# GCN: DS_WRITE2_B32_gfx9 %0, undef %1:vgpr_32, undef %2:vgpr_32
+---
name: ds_write_b32_v_v
body: |
bb.0:
+ ; GCN-LABEL: name: ds_write_b32_v_v
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: DS_WRITE2_B32_gfx9 [[DEF]], undef %1:vgpr_32, undef %2:vgpr_32, 0, 2, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) poison`, addrspace 3)
%0:vgpr_32 = IMPLICIT_DEF
DS_WRITE_B32_gfx9 %0, undef %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) poison`)
DS_WRITE_B32_gfx9 %0, undef %2:vgpr_32, 8, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) poison`)
...
-# GCN-LABEL: name: ds_write_b32_a_a
-# GCN: DS_WRITE_B32_gfx9 %0, undef %1:agpr_32
-# GCN: DS_WRITE_B32_gfx9 %0, undef %2:agpr_32
+---
name: ds_write_b32_a_a
body: |
bb.0:
+ ; GCN-LABEL: name: ds_write_b32_a_a
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: DS_WRITE_B32_gfx9 [[DEF]], undef %1:agpr_32, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) poison`, addrspace 3)
+ ; GCN-NEXT: DS_WRITE_B32_gfx9 [[DEF]], undef %2:agpr_32, 8, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) poison`, addrspace 3)
%0:vgpr_32 = IMPLICIT_DEF
DS_WRITE_B32_gfx9 %0, undef %1:agpr_32, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) poison`)
DS_WRITE_B32_gfx9 %0, undef %2:agpr_32, 8, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) poison`)
...
-# GCN-LABEL: name: ds_write_b32_v_a
-# GCN: DS_WRITE_B32_gfx9 %0, undef %1:vgpr_32
-# GCN: DS_WRITE_B32_gfx9 %0, undef %2:agpr_32
+---
name: ds_write_b32_v_a
body: |
bb.0:
+ ; GCN-LABEL: name: ds_write_b32_v_a
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: DS_WRITE_B32_gfx9 [[DEF]], undef %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) poison`, addrspace 3)
+ ; GCN-NEXT: DS_WRITE_B32_gfx9 [[DEF]], undef %2:agpr_32, 8, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) poison`, addrspace 3)
%0:vgpr_32 = IMPLICIT_DEF
DS_WRITE_B32_gfx9 %0, undef %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) poison`)
DS_WRITE_B32_gfx9 %0, undef %2:agpr_32, 8, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) poison`)
...
-# GCN-LABEL: name: ds_write_b32_a_v
-# GCN: DS_WRITE_B32_gfx9 %0, undef %1:agpr_32
-# GCN: DS_WRITE_B32_gfx9 %0, undef %2:vgpr_32
+---
name: ds_write_b32_a_v
body: |
bb.0:
+ ; GCN-LABEL: name: ds_write_b32_a_v
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: DS_WRITE_B32_gfx9 [[DEF]], undef %1:agpr_32, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) poison`, addrspace 3)
+ ; GCN-NEXT: DS_WRITE_B32_gfx9 [[DEF]], undef %2:vgpr_32, 8, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) poison`, addrspace 3)
%0:vgpr_32 = IMPLICIT_DEF
DS_WRITE_B32_gfx9 %0, undef %1:agpr_32, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) poison`)
DS_WRITE_B32_gfx9 %0, undef %2:vgpr_32, 8, 0, implicit $exec :: (store (s32) into `ptr addrspace(3) poison`)
diff --git a/llvm/test/CodeGen/AMDGPU/merge-tbuffer-gfx11.mir b/llvm/test/CodeGen/AMDGPU/merge-tbuffer-gfx11.mir
index 62cc565..f5407a5 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-tbuffer-gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-tbuffer-gfx11.mir
@@ -1525,3 +1525,621 @@ body: |
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
%9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact %4, %5:sgpr_128, 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
...
+---
+
+name: gfx11_tbuffer_load_x_x_x_idxen_16bit
+body: |
+ bb.0.entry:
+ liveins: $sgpr0,$sgpr1,$sgpr2,$sgpr3,$vgpr0
+ ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_x_idxen_16bit
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: %rsrc:sgpr_128 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1, $sgpr2, %subreg.sub2, $sgpr3, %subreg.sub3
+ ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY]], %rsrc, 0, 0, 57, 0, 0, implicit $exec :: (dereferenceable load (s48), align 2, addrspace 8)
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub0_sub1
+ ; GFX11-NEXT: %x2:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub2
+ ; GFX11-NEXT: %x0:vgpr_32 = COPY [[COPY1]].sub0
+ ; GFX11-NEXT: %x1:vgpr_32 = COPY killed [[COPY1]].sub1
+ %0:vgpr_32 = COPY $vgpr0
+ %rsrc:sgpr_128 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1,%subreg.sub1, $sgpr2, %subreg.sub2, $sgpr3, %subreg.sub3
+ %x0:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %0, %rsrc, 0, 0, 13, 0, 0, implicit $exec :: (dereferenceable load (s16),align 2,addrspace 8)
+ %x1:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %0, %rsrc, 0, 2, 13, 0, 0, implicit $exec :: (dereferenceable load (s16),align 2,addrspace 8)
+ %x2:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %0, %rsrc, 0, 4, 13, 0, 0, implicit $exec :: (dereferenceable load (s16),align 2,addrspace 8)
+...
+---
+
+name: gfx11_tbuffer_load_idxen_16_bit
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+ ; GFX11-LABEL: name: gfx11_tbuffer_load_idxen_16_bit
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY5]], [[REG_SEQUENCE]], 0, 0, 57, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0_sub1_sub2
+ ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub3
+ ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1
+ ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2
+ ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0
+ ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1
+ ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY5]], [[REG_SEQUENCE]], 0, 16, 57, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GFX11-NEXT: [[COPY12:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN1]].sub0_sub1_sub2
+ ; GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN1]].sub3
+ ; GFX11-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[COPY12]].sub0_sub1
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub2
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY14]].sub0
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY killed [[COPY14]].sub1
+ ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY5]], [[REG_SEQUENCE]], 0, 24, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ %4:sgpr_32 = COPY $sgpr4
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %8:vgpr_32 = COPY %4:sgpr_32
+ %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 0, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 2, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 4, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 6, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 16, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ %17:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 18, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ %19:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 20, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ %21:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 22, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ %22:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 24, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+...
+---
+
+name: gfx11_tbuffer_load_xy_xy_idxen_uint_16_bit
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0
+ ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_uint_16_bit
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: %rsrc:sgpr_128 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1, $sgpr2, %subreg.sub2, $sgpr3, %subreg.sub3
+ ; GFX11-NEXT: %idx:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN %idx, %rsrc, 0, 0, 55, 0, 0, implicit $exec :: (dereferenceable load (s64), align 2, addrspace 4)
+ ; GFX11-NEXT: %v0:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0_sub1
+ ; GFX11-NEXT: %v1:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub2_sub3
+ %rsrc:sgpr_128 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1,%subreg.sub1, $sgpr2,%subreg.sub2, $sgpr3,%subreg.sub3
+ %idx:vgpr_32 = COPY $vgpr0
+ %v0:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %idx, %rsrc, 0, 0, 27, 0, 0, implicit $exec :: (dereferenceable load (s32),align 2,addrspace 4)
+ %v1:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %idx, %rsrc, 0, 4, 27, 0, 0, implicit $exec :: (dereferenceable load (s32),align 2,addrspace 4)
+...
+---
+
+name: gfx11_tbuffer_load_xy_xy_idxen_sint_16_bit
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0
+ ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_sint_16_bit
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: %rsrc:sgpr_128 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1, $sgpr2, %subreg.sub2, $sgpr3, %subreg.sub3
+ ; GFX11-NEXT: %idx:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN %idx, %rsrc, 0, 0, 56, 0, 0, implicit $exec :: (dereferenceable load (s64), align 2, addrspace 4)
+ ; GFX11-NEXT: %v0:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0_sub1
+ ; GFX11-NEXT: %v1:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub2_sub3
+ %rsrc:sgpr_128 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1,%subreg.sub1, $sgpr2, %subreg.sub2, $sgpr3,%subreg.sub3
+ %idx:vgpr_32 = COPY $vgpr0
+ %v0:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %idx, %rsrc, 0, 0, 28, 0, 0, implicit $exec :: (dereferenceable load (s32),align 2,addrspace 4)
+ %v1:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %idx, %rsrc, 0, 4, 28, 0, 0, implicit $exec :: (dereferenceable load (s32),align 2,addrspace 4)
+...
+---
+
+name: gfx11_tbuffer_load_x_off2_off4_16bit_no_merge
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+ ; GFX11-LABEL: name: gfx11_tbuffer_load_x_off2_off4_16bit_no_merge
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY5]], [[REG_SEQUENCE]], 0, 2, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY5]], [[REG_SEQUENCE]], 0, 4, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ %4:sgpr_32 = COPY $sgpr4
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %8:vgpr_32 = COPY %4:sgpr_32
+ %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 2, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 4, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+...
+---
+
+name: gfx11_tbuffer_store_x_x_x_idxen_16_bit
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2
+ ; GFX11-LABEL: name: gfx11_tbuffer_store_x_x_x_idxen_16_bit
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX11-NEXT: %rsrc:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2
+ ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE1]], %rsrc, 0, 0, 57, 0, 0, implicit $exec :: (store (s48), align 2, addrspace 4)
+ %4:vgpr_32 = COPY $vgpr0
+ %5:vgpr_32 = COPY $vgpr1
+ %6:vgpr_32 = COPY $vgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+    %rsrc:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %rsrc, 0, 0, 13, 0, 0, implicit $exec :: (store (s16), align 2, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %rsrc, 0, 2, 13, 0, 0, implicit $exec :: (store (s16), align 2, addrspace 4)
+    TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %rsrc, 0, 4, 13, 0, 0, implicit $exec :: (store (s16), align 2, addrspace 4)
+...
+---
+
+name: gfx11_tbuffer_store_idxen_16_bit
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX11-LABEL: name: gfx11_tbuffer_store_idxen_16_bit
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX11-NEXT: %rsrc:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE]], %subreg.sub0_sub1, [[COPY6]], %subreg.sub2
+ ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[COPY5]], %subreg.sub3
+ ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], %rsrc, 0, 0, 57, 0, 0, implicit $exec :: (store (s64), align 2, addrspace 4)
+ ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX11-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2
+ ; GFX11-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE4]], %subreg.sub0_sub1_sub2, [[COPY1]], %subreg.sub3
+ ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE5]], %rsrc, 0, 8, 57, 0, 0, implicit $exec :: (store (s64), align 2, addrspace 4)
+ ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], %rsrc, 0, 16, 13, 0, 0, implicit $exec :: (store (s16), addrspace 4)
+ %12:vgpr_32 = COPY $vgpr8
+ %11:vgpr_32 = COPY $vgpr7
+ %10:vgpr_32 = COPY $vgpr6
+ %9:vgpr_32 = COPY $vgpr5
+ %8:vgpr_32 = COPY $vgpr4
+ %7:vgpr_32 = COPY $vgpr3
+ %6:vgpr_32 = COPY $vgpr2
+ %5:vgpr_32 = COPY $vgpr1
+ %4:vgpr_32 = COPY $vgpr0
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %rsrc:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %rsrc, 0, 0, 13, 0, 0, implicit $exec :: (store (s16), align 2, addrspace 4)
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %rsrc, 0, 2, 13, 0, 0, implicit $exec :: (store (s16), align 2, addrspace 4)
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %rsrc, 0, 4, 13, 0, 0, implicit $exec :: (store (s16), align 2, addrspace 4)
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %rsrc, 0, 6, 13, 0, 0, implicit $exec :: (store (s16), align 2, addrspace 4)
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %rsrc, 0, 8, 13, 0, 0, implicit $exec :: (store (s16), align 2, addrspace 4)
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %9, %rsrc, 0, 10, 13, 0, 0, implicit $exec :: (store (s16), align 2, addrspace 4)
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %10, %rsrc, 0, 12, 13, 0, 0, implicit $exec :: (store (s16), align 2, addrspace 4)
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %11, %rsrc, 0, 14, 13, 0, 0, implicit $exec :: (store (s16), align 2, addrspace 4)
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %12, %rsrc, 0, 16, 13, 0, 0, implicit $exec :: (store (s16), align 2, addrspace 4)
+...
+---
+
+name: gfx11_tbuffer_store_xy_xy_uint_16_bit
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX11-LABEL: name: gfx11_tbuffer_store_xy_xy_uint_16_bit
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1, $sgpr2, %subreg.sub2, $sgpr3, %subreg.sub3
+ ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE $vgpr0, %subreg.sub0, $vgpr1, %subreg.sub1
+ ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE $vgpr2, %subreg.sub0, $vgpr3, %subreg.sub1
+ ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
+ ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE3]], [[REG_SEQUENCE]], 0, 0, 55, 0, 0, implicit $exec :: (store (s64), align 2, addrspace 4)
+    %0:sgpr_128 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1, $sgpr2, %subreg.sub2, $sgpr3, %subreg.sub3
+ %1:vreg_64 = REG_SEQUENCE $vgpr0, %subreg.sub0, $vgpr1, %subreg.sub1
+ %2:vreg_64 = REG_SEQUENCE $vgpr2, %subreg.sub0, $vgpr3, %subreg.sub1
+    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %1, %0, 0, 0, 27, 0, 0, implicit $exec :: (store (s32), align 2, addrspace 4)
+    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %2, %0, 0, 4, 27, 0, 0, implicit $exec :: (store (s32), align 2, addrspace 4)
+...
+---
+
+name: gfx11_tbuffer_store_xy_xy_sint_16_bit
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX11-LABEL: name: gfx11_tbuffer_store_xy_xy_sint_16_bit
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1, $sgpr2, %subreg.sub2, $sgpr3, %subreg.sub3
+ ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE $vgpr0, %subreg.sub0, $vgpr1, %subreg.sub1
+ ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE $vgpr2, %subreg.sub0, $vgpr3, %subreg.sub1
+ ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
+ ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE3]], [[REG_SEQUENCE]], 0, 0, 56, 0, 0, implicit $exec :: (store (s64), align 2, addrspace 4)
+ %0:sgpr_128 = REG_SEQUENCE $sgpr0, %subreg.sub0, $sgpr1, %subreg.sub1, $sgpr2, %subreg.sub2, $sgpr3, %subreg.sub3
+ %1:vreg_64 = REG_SEQUENCE $vgpr0, %subreg.sub0, $vgpr1, %subreg.sub1
+ %2:vreg_64 = REG_SEQUENCE $vgpr2, %subreg.sub0, $vgpr3, %subreg.sub1
+    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %1, %0, 0, 0, 28, 0, 0, implicit $exec :: (store (s32), align 2, addrspace 4)
+    TBUFFER_STORE_FORMAT_XY_OFFSET_exact %2, %0, 0, 4, 28, 0, 0, implicit $exec :: (store (s32), align 2, addrspace 4)
+...
+---
+
+name: gfx11_tbuffer_load_x_x_x_idxen_8bit
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+ ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_x_idxen_8bit
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY5]], [[REG_SEQUENCE]], 0, 0, 46, 0, 0, implicit $exec :: (dereferenceable load (s24), align 1, addrspace 8)
+ ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub0_sub1
+ ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub2
+ ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY6]].sub0
+ ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub1
+ %4:sgpr_32 = COPY $sgpr4
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %8:vgpr_32 = COPY %4:sgpr_32
+ %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 0, 5, 0, 0, implicit $exec :: (dereferenceable load (s8), align 1, addrspace 8)
+ %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 1, 5, 0, 0, implicit $exec :: (dereferenceable load (s8), align 1, addrspace 8)
+ %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 2, 5, 0, 0, implicit $exec :: (dereferenceable load (s8), align 1, addrspace 8)
+...
+---
+
+name: gfx11_tbuffer_load_idxen_8bit
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+ ; GFX11-LABEL: name: gfx11_tbuffer_load_idxen_8bit
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY5]], [[REG_SEQUENCE]], 0, 0, 46, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0_sub1_sub2
+ ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub3
+ ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1
+ ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2
+ ; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0
+ ; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1
+ ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY5]], [[REG_SEQUENCE]], 0, 4, 46, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; GFX11-NEXT: [[COPY12:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN1]].sub0_sub1_sub2
+ ; GFX11-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN1]].sub3
+ ; GFX11-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[COPY12]].sub0_sub1
+ ; GFX11-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub2
+ ; GFX11-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY14]].sub0
+ ; GFX11-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY killed [[COPY14]].sub1
+ ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY5]], [[REG_SEQUENCE]], 0, 8, 5, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ %4:sgpr_32 = COPY $sgpr4
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %8:vgpr_32 = COPY %4:sgpr_32
+ %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 0, 5, 0, 0, implicit $exec :: (dereferenceable load (s8), align 1, addrspace 8)
+ %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 1, 5, 0, 0, implicit $exec :: (dereferenceable load (s8), align 1, addrspace 8)
+ %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 2, 5, 0, 0, implicit $exec :: (dereferenceable load (s8), align 1, addrspace 8)
+ %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 3, 5, 0, 0, implicit $exec :: (dereferenceable load (s8), align 1, addrspace 8)
+ %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 4, 5, 0, 0, implicit $exec :: (dereferenceable load (s8), align 1, addrspace 8)
+ %17:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 5, 5, 0, 0, implicit $exec :: (dereferenceable load (s8), align 1, addrspace 8)
+ %19:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 6, 5, 0, 0, implicit $exec :: (dereferenceable load (s8), align 1, addrspace 8)
+ %21:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 7, 5, 0, 0, implicit $exec :: (dereferenceable load (s8), align 1, addrspace 8)
+ %22:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 8, 5, 0, 0, implicit $exec :: (dereferenceable load (s8), align 1, addrspace 8)
+...
+---
+
+name: gfx11_tbuffer_load_xy_xy_idxen_uint_8bit
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+ ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_uint_8bit
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY5]], [[REG_SEQUENCE]], 0, 0, 43, 0, 0, implicit $exec :: (dereferenceable load (s32), align 2, addrspace 8)
+ ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0_sub1
+ ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub2_sub3
+ %4:sgpr_32 = COPY $sgpr4
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
+ %6:vgpr_32 = COPY %4
+ %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %6, %5, 0, 0, 15, 0, 0, implicit $exec :: (dereferenceable load (s16), align 2, addrspace 8)
+ %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %6, %5, 0, 2, 15, 0, 0, implicit $exec :: (dereferenceable load (s16), align 2, addrspace 8)
+...
+---
+
+name: gfx11_tbuffer_load_xy_xy_idxen_sint_8bit
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+ ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_sint_8bit
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY5]], [[REG_SEQUENCE]], 0, 0, 47, 0, 0, implicit $exec :: (dereferenceable load (s32), align 2, addrspace 8)
+ ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0_sub1
+ ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub2_sub3
+ %4:sgpr_32 = COPY $sgpr4
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
+ %6:vgpr_32 = COPY %4
+ %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %6, %5, 0, 0, 19, 0, 0, implicit $exec :: (dereferenceable load (s16), align 2, addrspace 8)
+ %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN %6, %5, 0, 2, 19, 0, 0, implicit $exec :: (dereferenceable load (s16), align 2, addrspace 8)
+...
+---
+
+name: gfx11_tbuffer_load_x_off3_off4_8bit_no_merge
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+ ; GFX11-LABEL: name: gfx11_tbuffer_load_x_off3_off4_8bit_no_merge
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+ ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY5]], [[REG_SEQUENCE]], 0, 3, 5, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY5]], [[REG_SEQUENCE]], 0, 4, 5, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 8)
+ %4:sgpr_32 = COPY $sgpr4
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %8:vgpr_32 = COPY %4:sgpr_32
+ %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 3, 5, 0, 0, implicit $exec :: (dereferenceable load (s8), align 1, addrspace 8)
+ %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN %8:vgpr_32, %5:sgpr_128, 0, 4, 5, 0, 0, implicit $exec :: (dereferenceable load (s8), align 1, addrspace 8)
+...
+---
+
+name: gfx11_tbuffer_store_x_x_x_idxen_8bit
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2
+ ; GFX11-LABEL: name: gfx11_tbuffer_store_x_x_x_idxen_8bit
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX11-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX11-NEXT: %rsrc:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
+ ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE1]], %rsrc, 0, 0, 46, 0, 0, implicit $exec :: (store (s24), align 1, addrspace 4)
+ %6:vgpr_32 = COPY $vgpr2
+ %5:vgpr_32 = COPY $vgpr1
+ %4:vgpr_32 = COPY $vgpr0
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %rsrc:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %rsrc, 0, 0, 5, 0, 0, implicit $exec :: (store (s8), align 1, addrspace 4)
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %rsrc, 0, 1, 5, 0, 0, implicit $exec :: (store (s8), align 1, addrspace 4)
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %rsrc, 0, 2, 5, 0, 0, implicit $exec :: (store (s8), align 1, addrspace 4)
+...
+---
+
+name: gfx11_tbuffer_store_idxen_8bit
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX11-LABEL: name: gfx11_tbuffer_store_idxen_8bit
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX11-NEXT: %rsrc:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE]], %subreg.sub0_sub1, [[COPY6]], %subreg.sub2
+ ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[COPY5]], %subreg.sub3
+ ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], %rsrc, 0, 0, 46, 0, 0, implicit $exec :: (store (s32), align 1, addrspace 4)
+ ; GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX11-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2
+ ; GFX11-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE4]], %subreg.sub0_sub1_sub2, [[COPY1]], %subreg.sub3
+ ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE5]], %rsrc, 0, 4, 46, 0, 0, implicit $exec :: (store (s32), align 1, addrspace 4)
+ ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], %rsrc, 0, 8, 5, 0, 0, implicit $exec :: (store (s8), addrspace 4)
+ %12:vgpr_32 = COPY $vgpr8
+ %11:vgpr_32 = COPY $vgpr7
+ %10:vgpr_32 = COPY $vgpr6
+ %9:vgpr_32 = COPY $vgpr5
+ %8:vgpr_32 = COPY $vgpr4
+ %7:vgpr_32 = COPY $vgpr3
+ %6:vgpr_32 = COPY $vgpr2
+ %5:vgpr_32 = COPY $vgpr1
+ %4:vgpr_32 = COPY $vgpr0
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %rsrc:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %rsrc, 0, 0, 5, 0, 0, implicit $exec :: (store (s8), align 1, addrspace 4)
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %rsrc, 0, 1, 5, 0, 0, implicit $exec :: (store (s8), align 1, addrspace 4)
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %rsrc, 0, 2, 5, 0, 0, implicit $exec :: (store (s8), align 1, addrspace 4)
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %rsrc, 0, 3, 5, 0, 0, implicit $exec :: (store (s8), align 1, addrspace 4)
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %rsrc, 0, 4, 5, 0, 0, implicit $exec :: (store (s8), align 1, addrspace 4)
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %9, %rsrc, 0, 5, 5, 0, 0, implicit $exec :: (store (s8), align 1, addrspace 4)
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %10, %rsrc, 0, 6, 5, 0, 0, implicit $exec :: (store (s8), align 1, addrspace 4)
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %11, %rsrc, 0, 7, 5, 0, 0, implicit $exec :: (store (s8), align 1, addrspace 4)
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %12, %rsrc, 0, 8, 5, 0, 0, implicit $exec :: (store (s8), align 1, addrspace 4)
+...
+---
+
+name: gfx11_tbuffer_store_xy_xy_idxen_uint_8bit
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX11-LABEL: name: gfx11_tbuffer_store_xy_xy_idxen_uint_8bit
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: %v0:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: %v1:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: %v2:vgpr_32 = COPY $vgpr2
+ ; GFX11-NEXT: %v3:vgpr_32 = COPY $vgpr3
+ ; GFX11-NEXT: %s3:sgpr_32 = COPY $sgpr3
+ ; GFX11-NEXT: %s2:sgpr_32 = COPY $sgpr2
+ ; GFX11-NEXT: %s1:sgpr_32 = COPY $sgpr1
+ ; GFX11-NEXT: %s0:sgpr_32 = COPY $sgpr0
+ ; GFX11-NEXT: %rsrc:sgpr_128 = REG_SEQUENCE %s0, %subreg.sub0, %s1, %subreg.sub1, %s2, %subreg.sub2, %s3, %subreg.sub3
+ ; GFX11-NEXT: %xy0:vreg_64 = REG_SEQUENCE %v0, %subreg.sub0, %v1, %subreg.sub1
+ ; GFX11-NEXT: %xy1:vreg_64 = REG_SEQUENCE %v2, %subreg.sub0, %v3, %subreg.sub1
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE %xy0, %subreg.sub0_sub1, %xy1, %subreg.sub2_sub3
+ ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE]], %rsrc, 0, 0, 43, 0, 0, implicit $exec :: (store (s32), align 2, addrspace 4)
+ %v0:vgpr_32 = COPY $vgpr0
+ %v1:vgpr_32 = COPY $vgpr1
+ %v2:vgpr_32 = COPY $vgpr2
+ %v3:vgpr_32 = COPY $vgpr3
+ %s3:sgpr_32 = COPY $sgpr3
+ %s2:sgpr_32 = COPY $sgpr2
+ %s1:sgpr_32 = COPY $sgpr1
+ %s0:sgpr_32 = COPY $sgpr0
+ %rsrc:sgpr_128 = REG_SEQUENCE %s0, %subreg.sub0, %s1, %subreg.sub1, %s2, %subreg.sub2, %s3, %subreg.sub3
+ %xy0:vreg_64 = REG_SEQUENCE %v0, %subreg.sub0, %v1, %subreg.sub1
+ %xy1:vreg_64 = REG_SEQUENCE %v2, %subreg.sub0, %v3, %subreg.sub1
+ TBUFFER_STORE_FORMAT_XY_OFFSET_exact %xy0, %rsrc, 0, 0, 15, 0, 0, implicit $exec :: (store (s16), align 2, addrspace 4)
+ TBUFFER_STORE_FORMAT_XY_OFFSET_exact %xy1, %rsrc, 0, 2, 15, 0, 0, implicit $exec :: (store (s16), align 2, addrspace 4)
+...
+---
+
+name: gfx11_tbuffer_store_xy_xy_idxen_sint_8bit
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX11-LABEL: name: gfx11_tbuffer_store_xy_xy_idxen_sint_8bit
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: %v0:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: %v1:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: %v2:vgpr_32 = COPY $vgpr2
+ ; GFX11-NEXT: %v3:vgpr_32 = COPY $vgpr3
+ ; GFX11-NEXT: %s3:sgpr_32 = COPY $sgpr3
+ ; GFX11-NEXT: %s2:sgpr_32 = COPY $sgpr2
+ ; GFX11-NEXT: %s1:sgpr_32 = COPY $sgpr1
+ ; GFX11-NEXT: %s0:sgpr_32 = COPY $sgpr0
+ ; GFX11-NEXT: %rsrc:sgpr_128 = REG_SEQUENCE %s0, %subreg.sub0, %s1, %subreg.sub1, %s2, %subreg.sub2, %s3, %subreg.sub3
+ ; GFX11-NEXT: %xy0:vreg_64 = REG_SEQUENCE %v0, %subreg.sub0, %v1, %subreg.sub1
+ ; GFX11-NEXT: %xy1:vreg_64 = REG_SEQUENCE %v2, %subreg.sub0, %v3, %subreg.sub1
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE %xy0, %subreg.sub0_sub1, %xy1, %subreg.sub2_sub3
+ ; GFX11-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE]], %rsrc, 0, 0, 42, 0, 0, implicit $exec :: (store (s32), align 2, addrspace 4)
+ %v0:vgpr_32 = COPY $vgpr0
+ %v1:vgpr_32 = COPY $vgpr1
+ %v2:vgpr_32 = COPY $vgpr2
+ %v3:vgpr_32 = COPY $vgpr3
+ %s3:sgpr_32 = COPY $sgpr3
+ %s2:sgpr_32 = COPY $sgpr2
+ %s1:sgpr_32 = COPY $sgpr1
+ %s0:sgpr_32 = COPY $sgpr0
+ %rsrc:sgpr_128 = REG_SEQUENCE %s0, %subreg.sub0, %s1, %subreg.sub1, %s2, %subreg.sub2, %s3, %subreg.sub3
+ %xy0:vreg_64 = REG_SEQUENCE %v0, %subreg.sub0, %v1, %subreg.sub1
+ %xy1:vreg_64 = REG_SEQUENCE %v2, %subreg.sub0, %v3, %subreg.sub1
+ TBUFFER_STORE_FORMAT_XY_OFFSET_exact %xy0, %rsrc, 0, 0, 14, 0, 0, implicit $exec :: (store (s16), align 2, addrspace 4)
+ TBUFFER_STORE_FORMAT_XY_OFFSET_exact %xy1, %rsrc, 0, 2, 14, 0, 0, implicit $exec :: (store (s16), align 2, addrspace 4)
+...
+---
+
+name: gfx11_tbuffer_store_x_off3_off4_8bit_no_merge
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1
+ ; GFX11-LABEL: name: gfx11_tbuffer_store_x_off3_off4_8bit_no_merge
+ ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1
+ ; GFX11-NEXT: {{ $}}
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX11-NEXT: %rsrc:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
+ ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], %rsrc, 0, 3, 5, 0, 0, implicit $exec :: (store (s8), addrspace 4)
+ ; GFX11-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], %rsrc, 0, 4, 5, 0, 0, implicit $exec :: (store (s8), addrspace 4)
+ %5:vgpr_32 = COPY $vgpr1
+ %4:vgpr_32 = COPY $vgpr0
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %rsrc:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %rsrc, 0, 3, 5, 0, 0, implicit $exec :: (store (s8), align 1, addrspace 4)
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %rsrc, 0, 4, 5, 0, 0, implicit $exec :: (store (s8), align 1, addrspace 4)
+...
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 6110b31..d39daaa 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -708,103 +708,72 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
; GFX908-LABEL: test_mfma_loop_unfoldable_seq:
; GFX908: ; %bb.0: ; %entry
; GFX908-NEXT: v_mov_b32_e32 v0, 0x431a0000
-; GFX908-NEXT: s_mov_b32 s0, 16
-; GFX908-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v1, 0x43190000
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x43160000
; GFX908-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x43190000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a30, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v1
; GFX908-NEXT: v_mov_b32_e32 v0, 0x43180000
-; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mov_b32_e32 v1, 0x43170000
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v2
; GFX908-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x43170000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x43160000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a27, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v1
; GFX908-NEXT: v_mov_b32_e32 v0, 0x43150000
-; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mov_b32_e32 v1, 0x43140000
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x43130000
; GFX908-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x43140000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x43130000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a24, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v2
; GFX908-NEXT: v_mov_b32_e32 v0, 0x43120000
-; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mov_b32_e32 v1, 0x43110000
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x43100000
; GFX908-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x43110000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x43100000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a21, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v2
; GFX908-NEXT: v_mov_b32_e32 v0, 0x430f0000
-; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mov_b32_e32 v1, 0x430e0000
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x430d0000
; GFX908-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x430e0000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x430d0000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a18, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v2
; GFX908-NEXT: v_mov_b32_e32 v0, 0x430c0000
-; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mov_b32_e32 v1, 0x430b0000
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x430a0000
; GFX908-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x430b0000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x430a0000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a15, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v2
; GFX908-NEXT: v_mov_b32_e32 v0, 0x43090000
-; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mov_b32_e32 v1, 0x43080000
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x43070000
; GFX908-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x43080000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x43070000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a12, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v2
; GFX908-NEXT: v_mov_b32_e32 v0, 0x43060000
-; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mov_b32_e32 v1, 0x43050000
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x43040000
; GFX908-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x43050000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x43040000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a9, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v2
; GFX908-NEXT: v_mov_b32_e32 v0, 0x43030000
-; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mov_b32_e32 v1, 0x43020000
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x43010000
; GFX908-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x43020000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x43010000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a6, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v2
; GFX908-NEXT: v_mov_b32_e32 v0, 0x43000000
-; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mov_b32_e32 v1, 0x42fe0000
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x42fc0000
; GFX908-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x42fe0000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x42fc0000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a3, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v2
; GFX908-NEXT: v_mov_b32_e32 v0, 0x42fa0000
-; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mov_b32_e32 v1, 0x42f80000
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x42f60000
; GFX908-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x42f80000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0x42f60000
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX908-NEXT: s_mov_b32 s0, 16
; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX908-NEXT: v_mov_b32_e32 v1, 1.0
; GFX908-NEXT: .LBB3_1: ; %for.cond.preheader
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_nop 1
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index 3702f32..57e6943 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -7,6 +7,10 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-GFX12,SDAG-GFX12-FAKE16 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-GFX12,GISEL-GFX12-TRUE16 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-GFX12,GISEL-GFX12-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX1250,SDAG,SDAG-GFX1250,SDAG-GFX1250-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX1250,SDAG,SDAG-GFX1250,SDAG-GFX1250-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX1250,GISEL,GISEL-GFX1250,GISEL-GFX1250-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX1250,GISEL,GISEL-GFX1250,GISEL-GFX1250-FAKE16 %s
define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) {
; GFX11-LABEL: test_minmax_i32:
@@ -24,6 +28,13 @@ define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maxmin_i32 v0, v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_minmax_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_maxmin_i32 v0, v0, v1, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
%sminmax = call i32 @llvm.smin.i32(i32 %smax, i32 %c)
ret i32 %sminmax
@@ -71,6 +82,13 @@ define i32 @test_minmax_commuted_i32(i32 %a, i32 %b, i32 %c) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maxmin_i32 v0, v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_minmax_commuted_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_maxmin_i32 v0, v0, v1, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
%sminmax = call i32 @llvm.smin.i32(i32 %c, i32 %smax)
ret i32 %sminmax
@@ -92,6 +110,13 @@ define i32 @test_maxmin_i32(i32 %a, i32 %b, i32 %c) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minmax_i32 v0, v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_maxmin_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_minmax_i32 v0, v0, v1, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%smin = call i32 @llvm.smin.i32(i32 %a, i32 %b)
%smaxmin = call i32 @llvm.smax.i32(i32 %smin, i32 %c)
ret i32 %smaxmin
@@ -113,6 +138,13 @@ define i32 @test_maxmin_commuted_i32(i32 %a, i32 %b, i32 %c) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minmax_i32 v0, v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_maxmin_commuted_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_minmax_i32 v0, v0, v1, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%smin = call i32 @llvm.smin.i32(i32 %a, i32 %b)
%smaxmin = call i32 @llvm.smax.i32(i32 %c, i32 %smin)
ret i32 %smaxmin
@@ -136,6 +168,14 @@ define void @test_smed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
; GFX12-NEXT: v_med3_i32 v2, v2, v3, v4
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_smed3_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_med3_i32 v2, v2, v3, v4
+; GFX1250-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%tmp0 = call i32 @llvm.smin.i32(i32 %x, i32 %y)
%tmp1 = call i32 @llvm.smax.i32(i32 %x, i32 %y)
%tmp2 = call i32 @llvm.smin.i32(i32 %tmp1, i32 %z)
@@ -160,6 +200,13 @@ define i32 @test_minmax_u32(i32 %a, i32 %b, i32 %c) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maxmin_u32 v0, v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_minmax_u32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_maxmin_u32 v0, v0, v1, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%umax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
%uminmax = call i32 @llvm.umin.i32(i32 %umax, i32 %c)
ret i32 %uminmax
@@ -207,6 +254,13 @@ define i32 @test_minmax_commuted_u32(i32 %a, i32 %b, i32 %c) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maxmin_u32 v0, v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_minmax_commuted_u32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_maxmin_u32 v0, v0, v1, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%umax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
%uminmax = call i32 @llvm.umin.i32(i32 %c, i32 %umax)
ret i32 %uminmax
@@ -228,6 +282,13 @@ define i32 @test_maxmin_u32(i32 %a, i32 %b, i32 %c) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minmax_u32 v0, v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_maxmin_u32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_minmax_u32 v0, v0, v1, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%umin = call i32 @llvm.umin.i32(i32 %a, i32 %b)
%umaxmin = call i32 @llvm.umax.i32(i32 %umin, i32 %c)
ret i32 %umaxmin
@@ -249,6 +310,13 @@ define i32 @test_maxmin_commuted_u32(i32 %a, i32 %b, i32 %c) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minmax_u32 v0, v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_maxmin_commuted_u32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_minmax_u32 v0, v0, v1, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%umin = call i32 @llvm.umin.i32(i32 %a, i32 %b)
%umaxmin = call i32 @llvm.umax.i32(i32 %c, i32 %umin)
ret i32 %umaxmin
@@ -272,6 +340,14 @@ define void @test_umed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
; GFX12-NEXT: v_med3_u32 v2, v2, v3, v4
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_umed3_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_med3_u32 v2, v2, v3, v4
+; GFX1250-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%tmp0 = call i32 @llvm.umin.i32(i32 %x, i32 %y)
%tmp1 = call i32 @llvm.umax.i32(i32 %x, i32 %y)
%tmp2 = call i32 @llvm.umin.i32(i32 %tmp1, i32 %z)
@@ -320,6 +396,24 @@ define float @test_minmax_f32_ieee_true(float %a, float %b, float %c) {
; GISEL-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
; GISEL-GFX12-NEXT: v_maxmin_num_f32 v0, v0, v1, v2
; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1250-LABEL: test_minmax_f32_ieee_true:
+; SDAG-GFX1250: ; %bb.0:
+; SDAG-GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX1250-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0
+; SDAG-GFX1250-NEXT: v_max_num_f32_e32 v2, v2, v2
+; SDAG-GFX1250-NEXT: v_maxmin_num_f32 v0, v0, v1, v2
+; SDAG-GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GISEL-GFX1250-LABEL: test_minmax_f32_ieee_true:
+; GISEL-GFX1250: ; %bb.0:
+; GISEL-GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX1250-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
+; GISEL-GFX1250-NEXT: v_max_num_f32_e32 v2, v2, v2
+; GISEL-GFX1250-NEXT: v_maxmin_num_f32 v0, v0, v1, v2
+; GISEL-GFX1250-NEXT: s_set_pc_i64 s[30:31]
%max = call float @llvm.maxnum.f32(float %a, float %b)
%minmax = call float @llvm.minnum.f32(float %max, float %c)
ret float %minmax
@@ -363,6 +457,26 @@ define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg
; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s0
; GISEL-GFX12-NEXT: global_store_b32 v1, v0, s[6:7]
; GISEL-GFX12-NEXT: s_endpgm
+;
+; SDAG-GFX1250-LABEL: s_test_minmax_f32_ieee_false:
+; SDAG-GFX1250: ; %bb.0:
+; SDAG-GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX1250-NEXT: s_mov_b32 s5, s4
+; SDAG-GFX1250-NEXT: s_mov_b32 s4, s3
+; SDAG-GFX1250-NEXT: v_maxmin_num_f32 v0, s0, s1, v0
+; SDAG-GFX1250-NEXT: global_store_b32 v1, v0, s[4:5]
+; SDAG-GFX1250-NEXT: s_endpgm
+;
+; GISEL-GFX1250-LABEL: s_test_minmax_f32_ieee_false:
+; GISEL-GFX1250: ; %bb.0:
+; GISEL-GFX1250-NEXT: s_max_num_f32 s0, s0, s1
+; GISEL-GFX1250-NEXT: s_mov_b32 s6, s3
+; GISEL-GFX1250-NEXT: s_mov_b32 s7, s4
+; GISEL-GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-GFX1250-NEXT: s_min_num_f32 s0, s0, s2
+; GISEL-GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX1250-NEXT: global_store_b32 v1, v0, s[6:7]
+; GISEL-GFX1250-NEXT: s_endpgm
%smax = call float @llvm.maxnum.f32(float %a, float %b)
%sminmax = call float @llvm.minnum.f32(float %smax, float %c)
store float %sminmax, ptr addrspace(1) %out
@@ -379,6 +493,11 @@ define amdgpu_ps float @test_minmax_commuted_f32_ieee_false(float %a, float %b,
; GFX12: ; %bb.0:
; GFX12-NEXT: v_maxmin_num_f32 v0, v0, v1, v2
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_minmax_commuted_f32_ieee_false:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_maxmin_num_f32 v0, v0, v1, v2
+; GFX1250-NEXT: ; return to shader part epilog
%max = call float @llvm.maxnum.f32(float %a, float %b)
%minmax = call float @llvm.minnum.f32(float %c, float %max)
ret float %minmax
@@ -424,6 +543,24 @@ define float @test_maxmin_f32_ieee_true(float %a, float %b, float %c) {
; GISEL-GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
; GISEL-GFX12-NEXT: v_minmax_num_f32 v0, v0, v1, v2
; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1250-LABEL: test_maxmin_f32_ieee_true:
+; SDAG-GFX1250: ; %bb.0:
+; SDAG-GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX1250-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0
+; SDAG-GFX1250-NEXT: v_max_num_f32_e32 v2, v2, v2
+; SDAG-GFX1250-NEXT: v_minmax_num_f32 v0, v0, v1, v2
+; SDAG-GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GISEL-GFX1250-LABEL: test_maxmin_f32_ieee_true:
+; GISEL-GFX1250: ; %bb.0:
+; GISEL-GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX1250-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
+; GISEL-GFX1250-NEXT: v_max_num_f32_e32 v2, v2, v2
+; GISEL-GFX1250-NEXT: v_minmax_num_f32 v0, v0, v1, v2
+; GISEL-GFX1250-NEXT: s_set_pc_i64 s[30:31]
%min = call float @llvm.minnum.f32(float %a, float %b)
%maxmin = call float @llvm.maxnum.f32(float %min, float %c)
ret float %maxmin
@@ -439,6 +576,11 @@ define amdgpu_ps float @test_maxmin_commuted_f32_ieee_false(float %a, float %b,
; GFX12: ; %bb.0:
; GFX12-NEXT: v_minmax_num_f32 v0, v0, v1, v2
; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: test_maxmin_commuted_f32_ieee_false:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_minmax_num_f32 v0, v0, v1, v2
+; GFX1250-NEXT: ; return to shader part epilog
%min = call float @llvm.minnum.f32(float %a, float %b)
%maxmin = call float @llvm.maxnum.f32(float %c, float %min)
ret float %maxmin
@@ -462,6 +604,14 @@ define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z)
; GFX12-NEXT: v_med3_num_f32 v2, v2, v3, v4
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: test_med3_f32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_med3_num_f32 v2, v2, v3, v4
+; GFX1250-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%tmp0 = call float @llvm.minnum.f32(float %x, float %y)
%tmp1 = call float @llvm.maxnum.f32(float %x, float %y)
%tmp2 = call float @llvm.minnum.f32(float %tmp1, float %z)
@@ -536,6 +686,26 @@ define amdgpu_ps half @test_minmax_f16_ieee_false(half %a, half %b, half %c) {
; GISEL-GFX12-FAKE16: ; %bb.0:
; GISEL-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
; GISEL-GFX12-FAKE16-NEXT: ; return to shader part epilog
+;
+; SDAG-GFX1250-TRUE16-LABEL: test_minmax_f16_ieee_false:
+; SDAG-GFX1250-TRUE16: ; %bb.0:
+; SDAG-GFX1250-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v1.l, v2.l
+; SDAG-GFX1250-TRUE16-NEXT: ; return to shader part epilog
+;
+; SDAG-GFX1250-FAKE16-LABEL: test_minmax_f16_ieee_false:
+; SDAG-GFX1250-FAKE16: ; %bb.0:
+; SDAG-GFX1250-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
+; SDAG-GFX1250-FAKE16-NEXT: ; return to shader part epilog
+;
+; GISEL-GFX1250-TRUE16-LABEL: test_minmax_f16_ieee_false:
+; GISEL-GFX1250-TRUE16: ; %bb.0:
+; GISEL-GFX1250-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v1.l, v2.l
+; GISEL-GFX1250-TRUE16-NEXT: ; return to shader part epilog
+;
+; GISEL-GFX1250-FAKE16-LABEL: test_minmax_f16_ieee_false:
+; GISEL-GFX1250-FAKE16: ; %bb.0:
+; GISEL-GFX1250-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
+; GISEL-GFX1250-FAKE16-NEXT: ; return to shader part epilog
%max = call half @llvm.maxnum.f16(half %a, half %b)
%minmax = call half @llvm.minnum.f16(half %max, half %c)
ret half %minmax
@@ -620,6 +790,47 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b
; GISEL-GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, s0
; GISEL-GFX12-FAKE16-NEXT: global_store_b16 v1, v0, s[6:7]
; GISEL-GFX12-FAKE16-NEXT: s_endpgm
+;
+; SDAG-GFX1250-TRUE16-LABEL: s_test_minmax_f16_ieee_false:
+; SDAG-GFX1250-TRUE16: ; %bb.0:
+; SDAG-GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; SDAG-GFX1250-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-GFX1250-TRUE16-NEXT: s_mov_b32 s5, s4
+; SDAG-GFX1250-TRUE16-NEXT: s_mov_b32 s4, s3
+; SDAG-GFX1250-TRUE16-NEXT: v_maxmin_num_f16 v0.l, s0, s1, v0.l
+; SDAG-GFX1250-TRUE16-NEXT: flat_store_b16 v1, v0, s[4:5]
+; SDAG-GFX1250-TRUE16-NEXT: s_endpgm
+;
+; SDAG-GFX1250-FAKE16-LABEL: s_test_minmax_f16_ieee_false:
+; SDAG-GFX1250-FAKE16: ; %bb.0:
+; SDAG-GFX1250-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; SDAG-GFX1250-FAKE16-NEXT: s_mov_b32 s5, s4
+; SDAG-GFX1250-FAKE16-NEXT: s_mov_b32 s4, s3
+; SDAG-GFX1250-FAKE16-NEXT: v_maxmin_num_f16 v0, s0, s1, v0
+; SDAG-GFX1250-FAKE16-NEXT: global_store_b16 v1, v0, s[4:5]
+; SDAG-GFX1250-FAKE16-NEXT: s_endpgm
+;
+; GISEL-GFX1250-TRUE16-LABEL: s_test_minmax_f16_ieee_false:
+; GISEL-GFX1250-TRUE16: ; %bb.0:
+; GISEL-GFX1250-TRUE16-NEXT: s_max_num_f16 s0, s0, s1
+; GISEL-GFX1250-TRUE16-NEXT: s_mov_b32 s6, s3
+; GISEL-GFX1250-TRUE16-NEXT: s_mov_b32 s7, s4
+; GISEL-GFX1250-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-GFX1250-TRUE16-NEXT: s_min_num_f16 s0, s0, s2
+; GISEL-GFX1250-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX1250-TRUE16-NEXT: flat_store_b16 v1, v0, s[6:7]
+; GISEL-GFX1250-TRUE16-NEXT: s_endpgm
+;
+; GISEL-GFX1250-FAKE16-LABEL: s_test_minmax_f16_ieee_false:
+; GISEL-GFX1250-FAKE16: ; %bb.0:
+; GISEL-GFX1250-FAKE16-NEXT: s_max_num_f16 s0, s0, s1
+; GISEL-GFX1250-FAKE16-NEXT: s_mov_b32 s6, s3
+; GISEL-GFX1250-FAKE16-NEXT: s_mov_b32 s7, s4
+; GISEL-GFX1250-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-GFX1250-FAKE16-NEXT: s_min_num_f16 s0, s0, s2
+; GISEL-GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX1250-FAKE16-NEXT: global_store_b16 v1, v0, s[6:7]
+; GISEL-GFX1250-FAKE16-NEXT: s_endpgm
%smax = call half @llvm.maxnum.f16(half %a, half %b)
%sminmax = call half @llvm.minnum.f16(half %smax, half %c)
store half %sminmax, ptr addrspace(1) %out
@@ -714,6 +925,46 @@ define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) {
; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
; GISEL-GFX12-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1250-TRUE16-LABEL: test_minmax_commuted_f16_ieee_true:
+; SDAG-GFX1250-TRUE16: ; %bb.0:
+; SDAG-GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX1250-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
+; SDAG-GFX1250-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; SDAG-GFX1250-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l
+; SDAG-GFX1250-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; SDAG-GFX1250-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true:
+; SDAG-GFX1250-FAKE16: ; %bb.0:
+; SDAG-GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1
+; SDAG-GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
+; SDAG-GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; SDAG-GFX1250-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
+; SDAG-GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GISEL-GFX1250-TRUE16-LABEL: test_minmax_commuted_f16_ieee_true:
+; GISEL-GFX1250-TRUE16: ; %bb.0:
+; GISEL-GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX1250-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GISEL-GFX1250-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
+; GISEL-GFX1250-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l
+; GISEL-GFX1250-TRUE16-NEXT: v_maxmin_num_f16 v0.l, v0.l, v0.h, v1.l
+; GISEL-GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GISEL-GFX1250-FAKE16-LABEL: test_minmax_commuted_f16_ieee_true:
+; GISEL-GFX1250-FAKE16: ; %bb.0:
+; GISEL-GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
+; GISEL-GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1
+; GISEL-GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GISEL-GFX1250-FAKE16-NEXT: v_maxmin_num_f16 v0, v0, v1, v2
+; GISEL-GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
%max = call half @llvm.maxnum.f16(half %a, half %b)
%minmax = call half @llvm.minnum.f16(half %c, half %max)
ret half %minmax
@@ -759,6 +1010,26 @@ define amdgpu_ps half @test_maxmin_f16_ieee_false(half %a, half %b, half %c) {
; GISEL-GFX12-FAKE16: ; %bb.0:
; GISEL-GFX12-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2
; GISEL-GFX12-FAKE16-NEXT: ; return to shader part epilog
+;
+; SDAG-GFX1250-TRUE16-LABEL: test_maxmin_f16_ieee_false:
+; SDAG-GFX1250-TRUE16: ; %bb.0:
+; SDAG-GFX1250-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v1.l, v2.l
+; SDAG-GFX1250-TRUE16-NEXT: ; return to shader part epilog
+;
+; SDAG-GFX1250-FAKE16-LABEL: test_maxmin_f16_ieee_false:
+; SDAG-GFX1250-FAKE16: ; %bb.0:
+; SDAG-GFX1250-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2
+; SDAG-GFX1250-FAKE16-NEXT: ; return to shader part epilog
+;
+; GISEL-GFX1250-TRUE16-LABEL: test_maxmin_f16_ieee_false:
+; GISEL-GFX1250-TRUE16: ; %bb.0:
+; GISEL-GFX1250-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v1.l, v2.l
+; GISEL-GFX1250-TRUE16-NEXT: ; return to shader part epilog
+;
+; GISEL-GFX1250-FAKE16-LABEL: test_maxmin_f16_ieee_false:
+; GISEL-GFX1250-FAKE16: ; %bb.0:
+; GISEL-GFX1250-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2
+; GISEL-GFX1250-FAKE16-NEXT: ; return to shader part epilog
%min = call half @llvm.minnum.f16(half %a, half %b)
%maxmin = call half @llvm.maxnum.f16(half %min, half %c)
ret half %maxmin
@@ -852,6 +1123,46 @@ define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) {
; GISEL-GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
; GISEL-GFX12-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2
; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1250-TRUE16-LABEL: test_maxmin_commuted_f16_ieee_true:
+; SDAG-GFX1250-TRUE16: ; %bb.0:
+; SDAG-GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX1250-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
+; SDAG-GFX1250-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; SDAG-GFX1250-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l
+; SDAG-GFX1250-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v0.h, v1.l
+; SDAG-GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; SDAG-GFX1250-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true:
+; SDAG-GFX1250-FAKE16: ; %bb.0:
+; SDAG-GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1
+; SDAG-GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
+; SDAG-GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; SDAG-GFX1250-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2
+; SDAG-GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GISEL-GFX1250-TRUE16-LABEL: test_maxmin_commuted_f16_ieee_true:
+; GISEL-GFX1250-TRUE16: ; %bb.0:
+; GISEL-GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX1250-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GISEL-GFX1250-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
+; GISEL-GFX1250-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l
+; GISEL-GFX1250-TRUE16-NEXT: v_minmax_num_f16 v0.l, v0.l, v0.h, v1.l
+; GISEL-GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GISEL-GFX1250-FAKE16-LABEL: test_maxmin_commuted_f16_ieee_true:
+; GISEL-GFX1250-FAKE16: ; %bb.0:
+; GISEL-GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
+; GISEL-GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1
+; GISEL-GFX1250-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2
+; GISEL-GFX1250-FAKE16-NEXT: v_minmax_num_f16 v0, v0, v1, v2
+; GISEL-GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
%min = call half @llvm.minnum.f16(half %a, half %b)
%maxmin = call half @llvm.maxnum.f16(half %c, half %min)
ret half %maxmin
@@ -929,6 +1240,38 @@ define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0
; GISEL-GFX12-FAKE16-NEXT: v_med3_num_f16 v2, v2, v3, v4
; GISEL-GFX12-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1250-TRUE16-LABEL: test_med3_f16:
+; SDAG-GFX1250-TRUE16: ; %bb.0:
+; SDAG-GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX1250-TRUE16-NEXT: v_med3_num_f16 v2.l, v2.l, v3.l, v4.l
+; SDAG-GFX1250-TRUE16-NEXT: flat_store_b16 v[0:1], v2
+; SDAG-GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; SDAG-GFX1250-FAKE16-LABEL: test_med3_f16:
+; SDAG-GFX1250-FAKE16: ; %bb.0:
+; SDAG-GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX1250-FAKE16-NEXT: v_med3_num_f16 v2, v2, v3, v4
+; SDAG-GFX1250-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; SDAG-GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GISEL-GFX1250-TRUE16-LABEL: test_med3_f16:
+; GISEL-GFX1250-TRUE16: ; %bb.0:
+; GISEL-GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX1250-TRUE16-NEXT: v_med3_num_f16 v2.l, v2.l, v3.l, v4.l
+; GISEL-GFX1250-TRUE16-NEXT: flat_store_b16 v[0:1], v2
+; GISEL-GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GISEL-GFX1250-FAKE16-LABEL: test_med3_f16:
+; GISEL-GFX1250-FAKE16: ; %bb.0:
+; GISEL-GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX1250-FAKE16-NEXT: v_med3_num_f16 v2, v2, v3, v4
+; GISEL-GFX1250-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GISEL-GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
%tmp0 = call half @llvm.minnum.f16(half %x, half %y)
%tmp1 = call half @llvm.maxnum.f16(half %x, half %y)
%tmp2 = call half @llvm.minnum.f16(half %tmp1, half %z)
@@ -946,4 +1289,3 @@ declare half @llvm.maxnum.f16(half, half)
declare float @llvm.minnum.f32(float, float)
declare float @llvm.maxnum.f32(float, float)
attributes #0 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
-
diff --git a/llvm/test/CodeGen/AMDGPU/mmra.ll b/llvm/test/CodeGen/AMDGPU/mmra.ll
index 4449978..f66b575 100644
--- a/llvm/test/CodeGen/AMDGPU/mmra.ll
+++ b/llvm/test/CodeGen/AMDGPU/mmra.ll
@@ -11,15 +11,13 @@ define void @fence_loads(ptr %ptr) {
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; CHECK-NEXT: ATOMIC_FENCE 5, 1, mmra !0
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !1
- ; CHECK-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr, mmra !1 :: (load acquire (s8) from %ir.ptr, align 4)
+ ; CHECK-NEXT: ATOMIC_FENCE 5, 1, mmra !0
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !1
+ ; CHECK-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr, mmra !1 :: (load acquire (s8) from %ir.ptr, align 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !2
- ; CHECK-NEXT: FLAT_STORE_BYTE [[COPY3]], killed [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr, mmra !2 :: (store release (s8) into %ir.ptr, align 4)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !2
+ ; CHECK-NEXT: FLAT_STORE_BYTE [[COPY3]], killed [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr, mmra !2 :: (store release (s8) into %ir.ptr, align 4)
; CHECK-NEXT: SI_RETURN
fence release, !mmra !0
%ld = load atomic i8, ptr %ptr acquire, align 4, !mmra !2
@@ -34,11 +32,9 @@ define void @atomicrmw_acq(ptr %ptr) {
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !1
- ; CHECK-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE killed [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr, mmra !1 :: (load acquire (s8) from %ir.ptr)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], mmra !1
+ ; CHECK-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE killed [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr, mmra !1 :: (load acquire (s8) from %ir.ptr)
; CHECK-NEXT: SI_RETURN
%old.2 = atomicrmw add ptr %ptr, i8 0 acquire, !mmra !2
ret void
@@ -52,15 +48,11 @@ define void @atomicrmw_rel(ptr %ptr) {
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4
; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY3]], killed [[S_MOV_B32_]], implicit $exec
- ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3
@@ -69,8 +61,8 @@ define void @atomicrmw_rel(ptr %ptr) {
; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 255
; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[V_LSHLREV_B32_e64_]], killed [[S_MOV_B32_2]], implicit $exec
; CHECK-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[V_LSHLREV_B32_e64_1]], implicit $exec
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], mmra !2
- ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr, mmra !2 :: (load (s32) from %ir.AlignedAddr)
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], mmra !2
+ ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr, mmra !2 :: (load (s32) from %ir.AlignedAddr)
; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1.atomicrmw.start:
@@ -80,12 +72,10 @@ define void @atomicrmw_rel(ptr %ptr) {
; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, %6, %bb.1
; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_NOT_B32_e32_]], [[V_LSHLREV_B32_e64_1]], implicit $exec
; CHECK-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[PHI1]], killed [[V_OR_B32_e64_]], implicit $exec
- ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_2]], %subreg.sub0, [[PHI1]], %subreg.sub1
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]], mmra !2
- ; CHECK-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY4]], killed [[COPY6]], 0, 1, implicit $exec, implicit $flat_scr, mmra !2 :: (load store release monotonic (s32) on %ir.AlignedAddr)
- ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec, mmra !2
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]], mmra !2
+ ; CHECK-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY4]], killed [[COPY6]], 0, 1, implicit $exec, implicit $flat_scr, mmra !2 :: (load store release monotonic (s32) on %ir.AlignedAddr)
+ ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec, mmra !2
; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U32_e64_]], [[PHI]], implicit-def dead $scc
; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.2
@@ -106,15 +96,11 @@ define void @cmpxchg(ptr %ptr) {
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4
; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY3]], killed [[S_MOV_B32_]], implicit $exec
- ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3
@@ -125,27 +111,25 @@ define void @cmpxchg(ptr %ptr) {
; CHECK-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 killed [[V_LSHLREV_B32_e64_1]], implicit $exec
; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 1
; CHECK-NEXT: [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_LSHLREV_B32_e64_]], killed [[S_MOV_B32_3]], implicit $exec
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], mmra !1
- ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr, mmra !1 :: (load (s32) from %ir.AlignedAddr)
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], mmra !1
+ ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr, mmra !1 :: (load (s32) from %ir.AlignedAddr)
; CHECK-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[FLAT_LOAD_DWORD]], [[V_NOT_B32_e32_]], implicit $exec
; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
- ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1.partword.cmpxchg.loop:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF4]], %bb.0, %12, %bb.3
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF]], %bb.0, %12, %bb.3
; CHECK-NEXT: [[PHI1:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, %13, %bb.3
; CHECK-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[V_AND_B32_e64_2]], %bb.0, %11, %bb.3
; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[PHI2]], [[V_LSHLREV_B32_e64_2]], implicit $exec
- ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_OR_B32_e64_]], %subreg.sub0, [[PHI2]], %subreg.sub1
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]], mmra !1
- ; CHECK-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY4]], killed [[COPY6]], 0, 1, implicit $exec, implicit $flat_scr, mmra !1 :: (load store acquire acquire (s32) on %ir.AlignedAddr)
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]], mmra !1
+ ; CHECK-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY4]], killed [[COPY6]], 0, 1, implicit $exec, implicit $flat_scr, mmra !1 :: (load store acquire acquire (s32) on %ir.AlignedAddr)
; CHECK-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI2]], implicit $exec
; CHECK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
- ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[PHI]], $exec, implicit-def $scc
; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_NE_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.2
@@ -163,7 +147,7 @@ define void @cmpxchg(ptr %ptr) {
; CHECK-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI3:%[0-9]+]]:sreg_64 = PHI [[S_OR_B64_]], %bb.1, [[S_OR_B64_1]], %bb.2
- ; CHECK-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[DEF7]], %bb.1, [[V_AND_B32_e64_3]], %bb.2
+ ; CHECK-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[V_AND_B32_e64_3]], %bb.2
; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[PHI3]]
; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK [[COPY7]], [[PHI1]], implicit-def dead $scc
@@ -199,8 +183,6 @@ define void @atomicrmw_rel_deepcopy(ptr %ptr) {
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7
; CHECK-NEXT: [[COPY10:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
@@ -238,8 +220,6 @@ define void @atomicrmw_rel_deepcopy(ptr %ptr) {
; CHECK-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -4
; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY29]], killed [[S_MOV_B32_]], implicit $exec
- ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY11]], %subreg.sub1
; CHECK-NEXT: [[COPY30:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3
@@ -248,8 +228,8 @@ define void @atomicrmw_rel_deepcopy(ptr %ptr) {
; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 255
; CHECK-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[V_LSHLREV_B32_e64_]], killed [[S_MOV_B32_2]], implicit $exec
; CHECK-NEXT: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 [[V_LSHLREV_B32_e64_1]], implicit $exec
- ; CHECK-NEXT: [[COPY31:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], mmra !0
- ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY31]], 0, 0, implicit $exec, implicit $flat_scr, mmra !0 :: (load (s32) from %ir.AlignedAddr)
+ ; CHECK-NEXT: [[COPY31:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], mmra !0
+ ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY31]], 0, 0, implicit $exec, implicit $flat_scr, mmra !0 :: (load (s32) from %ir.AlignedAddr)
; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1.atomicrmw.start:
@@ -259,12 +239,10 @@ define void @atomicrmw_rel_deepcopy(ptr %ptr) {
; CHECK-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, %6, %bb.1
; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_NOT_B32_e32_]], [[V_LSHLREV_B32_e64_1]], implicit $exec
; CHECK-NEXT: [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[PHI1]], killed [[V_OR_B32_e64_]], implicit $exec
- ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_2]], %subreg.sub0, [[PHI1]], %subreg.sub1
- ; CHECK-NEXT: [[COPY32:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]], mmra !0
- ; CHECK-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY30]], killed [[COPY32]], 0, 1, implicit $exec, implicit $flat_scr, mmra !0 :: (load store release monotonic (s32) on %ir.AlignedAddr)
- ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec, mmra !0
+ ; CHECK-NEXT: [[COPY32:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]], mmra !0
+ ; CHECK-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY30]], killed [[COPY32]], 0, 1, implicit $exec, implicit $flat_scr, mmra !0 :: (load store release monotonic (s32) on %ir.AlignedAddr)
+ ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec, mmra !0
; CHECK-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U32_e64_]], [[PHI]], implicit-def dead $scc
; CHECK-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: S_BRANCH %bb.2
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll
index a7b4ba8..b524a1d 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll
@@ -13,7 +13,6 @@ define amdgpu_kernel void @add_reg_imm(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 28744523
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1395630315
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO [[GLOBAL_LOAD_DWORDX2_SADDR]], killed [[REG_SEQUENCE]], implicit-def $vcc_lo, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_ADD_U]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
@@ -34,8 +33,6 @@ define amdgpu_kernel void @add_reg_reg(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile load (s64) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO [[GLOBAL_LOAD_DWORDX2_SADDR]], [[GLOBAL_LOAD_DWORDX2_SADDR1]], implicit-def $vcc_lo, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_ADD_U]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
@@ -59,7 +56,6 @@ define amdgpu_kernel void @sub_reg_imm(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -28744524
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1395630315
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO [[GLOBAL_LOAD_DWORDX2_SADDR]], killed [[REG_SEQUENCE]], implicit-def $vcc_lo, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_ADD_U]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
@@ -82,7 +78,6 @@ define amdgpu_kernel void @sub_imm_reg(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 28744523
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1395630315
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO killed [[REG_SEQUENCE]], [[GLOBAL_LOAD_DWORDX2_SADDR]], implicit-def $vcc_lo, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_SUB_U]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
@@ -103,8 +98,6 @@ define amdgpu_kernel void @sub_reg_reg(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile load (s64) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO [[GLOBAL_LOAD_DWORDX2_SADDR]], [[GLOBAL_LOAD_DWORDX2_SADDR1]], implicit-def $vcc_lo, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_SUB_U]]
; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
index fcc5584..2d73a6b 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
@@ -47,7 +47,7 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1
atomic:
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 100
- %ret = atomicrmw max ptr addrspace(1) %gep, i32 %y syncscope("workgroup") seq_cst
+ %ret = atomicrmw max ptr addrspace(1) %gep, i32 %y syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
store i32 %ret, ptr addrspace(1) %out
br label %exit
@@ -87,7 +87,7 @@ define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrs
atomic:
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 100
- %ret = atomicrmw max ptr addrspace(1) %gep, i32 %y syncscope("workgroup") seq_cst
+ %ret = atomicrmw max ptr addrspace(1) %gep, i32 %y syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0
br label %exit
exit:
@@ -96,3 +96,5 @@ exit:
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-lshlrev.mir b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshlrev.mir
index d19318c..11de739 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-lshlrev.mir
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshlrev.mir
@@ -11,18 +11,16 @@ body: |
; GFX8-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GFX8-NEXT: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; GFX8-NEXT: [[V_LSHL_ADD_U64_e64_:%[0-9]+]]:vreg_64 = V_LSHL_ADD_U64_e64 [[DEF]], [[DEF1]], [[DEF2]], implicit $exec
- ; GFX8-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; GFX8-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GFX8-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64_e64 [[DEF4]], [[V_LSHL_ADD_U64_e64_]], implicit $exec
+ ; GFX8-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX8-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64_e64 [[DEF3]], [[V_LSHL_ADD_U64_e64_]], implicit $exec
;
; GFX12-LABEL: name: lshlrev_b64
; GFX12: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; GFX12-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GFX12-NEXT: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; GFX12-NEXT: [[V_LSHL_ADD_U64_e64_:%[0-9]+]]:vreg_64 = V_LSHL_ADD_U64_e64 [[DEF]], [[DEF1]], [[DEF2]], implicit $exec
- ; GFX12-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
- ; GFX12-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GFX12-NEXT: [[V_LSHLREV_B64_pseudo_e64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64_pseudo_e64 [[DEF4]], [[V_LSHL_ADD_U64_e64_]], implicit $exec
+ ; GFX12-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX12-NEXT: [[V_LSHLREV_B64_pseudo_e64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64_pseudo_e64 [[DEF3]], [[V_LSHL_ADD_U64_e64_]], implicit $exec
%0:vreg_64 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vreg_64 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll
index f7fb4a6..6c18400 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll
@@ -10,9 +10,7 @@ define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_EXP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_EXP_F16_fake16_e64_]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
@@ -31,9 +29,7 @@ define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_LOG_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_LOG_F16_fake16_e64_]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
@@ -52,9 +48,7 @@ define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_RCP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RCP_F16_fake16_e64_]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
@@ -73,9 +67,7 @@ define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_RSQ_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RSQ_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RSQ_F16_fake16_e64_]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
@@ -94,9 +86,7 @@ define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_SQRT_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SQRT_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_SQRT_F16_fake16_e64_]]
; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
index d6b0958..6e8cbc3 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
@@ -10,7 +10,6 @@ define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_EXP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_EXP_F32_e64_]]
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
@@ -30,7 +29,6 @@ define amdgpu_kernel void @log_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_LOG_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_LOG_F32_e64_]]
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
@@ -50,7 +48,6 @@ define amdgpu_kernel void @rcp_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RCP_F32_e64_]]
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
@@ -70,7 +67,6 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_RSQ_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RSQ_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RSQ_F32_e64_]]
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
@@ -90,7 +86,6 @@ define amdgpu_kernel void @sqrt_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_SQRT_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SQRT_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_SQRT_F32_e64_]]
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll
index 0f4715f..10d3636 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll
@@ -16,14 +16,6 @@ define amdgpu_ps float @vimage_move_to_valu(<8 x i32> %rsrc) {
; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3, [[COPY3]], %subreg.sub4, [[COPY2]], %subreg.sub5, [[COPY1]], %subreg.sub6, [[COPY]], %subreg.sub7
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
@@ -78,14 +70,6 @@ define amdgpu_ps float @vimage_move_to_valu(<8 x i32> %rsrc) {
; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX12-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX12-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX12-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX12-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX12-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX12-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX12-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3, [[COPY3]], %subreg.sub4, [[COPY2]], %subreg.sub5, [[COPY1]], %subreg.sub6, [[COPY]], %subreg.sub7
; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
@@ -149,14 +133,6 @@ define amdgpu_ps float @vsample_move_to_valu_rsrc(<8 x i32> %rsrc, <4 x i32> inr
; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX11-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX11-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX11-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3, [[COPY7]], %subreg.sub4, [[COPY6]], %subreg.sub5, [[COPY5]], %subreg.sub6, [[COPY4]], %subreg.sub7
; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -216,14 +192,6 @@ define amdgpu_ps float @vsample_move_to_valu_rsrc(<8 x i32> %rsrc, <4 x i32> inr
; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX12-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX12-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX12-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX12-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX12-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX12-NEXT: [[DEF6:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX12-NEXT: [[DEF7:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3, [[COPY7]], %subreg.sub4, [[COPY6]], %subreg.sub5, [[COPY5]], %subreg.sub6, [[COPY4]], %subreg.sub7
; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -289,10 +257,6 @@ define amdgpu_ps float @vsample_move_to_valu_samp(<8 x i32> inreg %rsrc, <4 x i3
; GFX11-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; GFX11-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_256 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3, [[COPY7]], %subreg.sub4, [[COPY6]], %subreg.sub5, [[COPY5]], %subreg.sub6, [[COPY4]], %subreg.sub7
- ; GFX11-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
@@ -342,10 +306,6 @@ define amdgpu_ps float @vsample_move_to_valu_samp(<8 x i32> inreg %rsrc, <4 x i3
; GFX12-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr1
; GFX12-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_256 = REG_SEQUENCE [[COPY11]], %subreg.sub0, [[COPY10]], %subreg.sub1, [[COPY9]], %subreg.sub2, [[COPY8]], %subreg.sub3, [[COPY7]], %subreg.sub4, [[COPY6]], %subreg.sub5, [[COPY5]], %subreg.sub6, [[COPY4]], %subreg.sub7
- ; GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX12-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX12-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX12-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
; GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
index e12fe97..cf0fbe4 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
@@ -147,10 +147,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
; W64-O0-NEXT: v_mov_b32_e32 v4, v3
; W64-O0-NEXT: v_mov_b32_e32 v5, v2
; W64-O0-NEXT: v_mov_b32_e32 v6, v1
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v1, v6
; W64-O0-NEXT: v_mov_b32_e32 v2, v5
@@ -160,7 +156,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
; W64-O0-NEXT: s_mov_b32 s4, 0
; W64-O0-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
; W64-O0-NEXT: v_writelane_b32 v7, s4, 0
@@ -505,10 +500,6 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; W64-O0-NEXT: v_mov_b32_e32 v2, v0
; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v14, v5
; W64-O0-NEXT: s_waitcnt vmcnt(3)
@@ -520,10 +511,6 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v3, v8
; W64-O0-NEXT: v_mov_b32_e32 v4, v7
@@ -533,26 +520,18 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v2, v12
; W64-O0-NEXT: s_waitcnt vmcnt(9)
; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; W64-O0-NEXT: s_nop 0
; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v1, v10
; W64-O0-NEXT: s_waitcnt vmcnt(10)
; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; W64-O0-NEXT: s_nop 0
; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
-; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
; W64-O0-NEXT: s_mov_b32 s4, 0
; W64-O0-NEXT: ; implicit-def: $vgpr17 : SGPR spill to VGPR lane
; W64-O0-NEXT: v_writelane_b32 v17, s4, 0
@@ -1032,18 +1011,10 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
; W64-O0-NEXT: v_mov_b32_e32 v11, v1
; W64-O0-NEXT: v_mov_b32_e32 v5, v0
; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v1, v8
; W64-O0-NEXT: v_mov_b32_e32 v2, v6
; W64-O0-NEXT: v_mov_b32_e32 v3, v7
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v6, v11
; W64-O0-NEXT: v_mov_b32_e32 v7, v10
@@ -1053,8 +1024,6 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v5, v12
; W64-O0-NEXT: s_waitcnt vmcnt(6)
@@ -1067,7 +1036,6 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
; W64-O0-NEXT: ;;#ASMSTART
; W64-O0-NEXT: s_mov_b32 s4, 17
; W64-O0-NEXT: ;;#ASMEND
@@ -1176,7 +1144,6 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
; W64-O0-NEXT: v_writelane_b32 v13, s5, 12
; W64-O0-NEXT: v_mov_b32_e32 v0, s4
; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
; W64-O0-NEXT: s_mov_b64 s[4:5], exec
; W64-O0-NEXT: v_writelane_b32 v13, s4, 13
; W64-O0-NEXT: v_writelane_b32 v13, s5, 14
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index 3d3c59f..6368030 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -145,22 +145,14 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; W64-O0-NEXT: v_mov_b32_e32 v5, v2
; W64-O0-NEXT: v_mov_b32_e32 v2, v1
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v6, v3
; W64-O0-NEXT: v_mov_b32_e32 v4, v6
; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v1, v2
; W64-O0-NEXT: v_mov_b32_e32 v6, v1
; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v1, v6
; W64-O0-NEXT: v_mov_b32_e32 v2, v5
@@ -170,8 +162,6 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; W64-O0-NEXT: s_mov_b32 s4, 0
; W64-O0-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
; W64-O0-NEXT: v_writelane_b32 v7, s4, 0
@@ -515,23 +505,15 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; W64-O0-NEXT: v_mov_b32_e32 v2, v0
; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
; W64-O0-NEXT: s_waitcnt vmcnt(2)
; W64-O0-NEXT: v_mov_b32_e32 v15, v5
; W64-O0-NEXT: v_mov_b32_e32 v5, v15
; W64-O0-NEXT: v_mov_b32_e32 v6, v14
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v14, v8
; W64-O0-NEXT: v_mov_b32_e32 v8, v14
; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 killed $vgpr13_vgpr14 killed $exec
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v14, v8
; W64-O0-NEXT: v_mov_b32_e32 v15, v6
@@ -541,22 +523,14 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v8, v3
; W64-O0-NEXT: v_mov_b32_e32 v6, v8
; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v3, v4
; W64-O0-NEXT: v_mov_b32_e32 v8, v3
; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v3, v8
; W64-O0-NEXT: v_mov_b32_e32 v4, v7
@@ -566,28 +540,18 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v2, v12
; W64-O0-NEXT: s_waitcnt vmcnt(9)
; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; W64-O0-NEXT: s_nop 0
; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v1, v10
; W64-O0-NEXT: s_waitcnt vmcnt(10)
; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; W64-O0-NEXT: s_nop 0
; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; W64-O0-NEXT: s_mov_b32 s4, 0
; W64-O0-NEXT: ; implicit-def: $vgpr17 : SGPR spill to VGPR lane
; W64-O0-NEXT: v_writelane_b32 v17, s4, 0
@@ -1069,22 +1033,14 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; W64-O0-NEXT: v_mov_b32_e32 v8, v0
; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v10, v3
; W64-O0-NEXT: v_mov_b32_e32 v3, v10
; W64-O0-NEXT: v_mov_b32_e32 v5, v9
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v9, v6
; W64-O0-NEXT: v_mov_b32_e32 v6, v9
; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v9, v6
; W64-O0-NEXT: v_mov_b32_e32 v10, v5
@@ -1094,17 +1050,11 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; W64-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v5, v7
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; W64-O0-NEXT: s_waitcnt vmcnt(5)
; W64-O0-NEXT: v_mov_b32_e32 v3, v1
-; W64-O0-NEXT: ; implicit-def: $sgpr4
-; W64-O0-NEXT: ; implicit-def: $sgpr4
; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v1, v12
; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
@@ -1113,8 +1063,6 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; W64-O0-NEXT: s_nop 0
; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; W64-O0-NEXT: s_waitcnt vmcnt(8)
; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; W64-O0-NEXT: s_nop 0
@@ -1232,10 +1180,6 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
; W64-O0-NEXT: v_mov_b32_e32 v0, v3
; W64-O0-NEXT: v_mov_b32_e32 v4, v2
; W64-O0-NEXT: v_mov_b32_e32 v5, v1
-; W64-O0-NEXT: ; implicit-def: $sgpr5
-; W64-O0-NEXT: ; implicit-def: $sgpr5
-; W64-O0-NEXT: ; implicit-def: $sgpr5
-; W64-O0-NEXT: ; implicit-def: $sgpr5
; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
; W64-O0-NEXT: v_mov_b32_e32 v1, v6
; W64-O0-NEXT: v_mov_b32_e32 v2, v5
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
index f9dd736..6ef1574 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
@@ -36,10 +36,6 @@ body: |
; W64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; W64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; W64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; W64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; W64-NEXT: {{ $}}
@@ -81,10 +77,6 @@ body: |
; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; W32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; W32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; W32-NEXT: {{ $}}
@@ -151,10 +143,6 @@ body: |
; W64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; W64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; W64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; W64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; W64-NEXT: {{ $}}
@@ -196,10 +184,6 @@ body: |
; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; W32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; W32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; W32-NEXT: {{ $}}
@@ -266,10 +250,6 @@ body: |
; W64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; W64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; W64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; W64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; W64-NEXT: {{ $}}
@@ -311,10 +291,6 @@ body: |
; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; W32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; W32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; W32-NEXT: {{ $}}
@@ -380,10 +356,6 @@ body: |
; ADDR64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; ADDR64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; ADDR64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; ADDR64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
@@ -407,10 +379,6 @@ body: |
; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; W32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; W32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W32-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
; W32-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
@@ -460,10 +428,6 @@ body: |
; ADDR64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; ADDR64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; ADDR64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; ADDR64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
@@ -486,10 +450,6 @@ body: |
; W64-NO-ADDR64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; W64-NO-ADDR64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; W64-NO-ADDR64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; W64-NO-ADDR64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W64-NO-ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W64-NO-ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W64-NO-ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W64-NO-ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
; W64-NO-ADDR64-NEXT: {{ $}}
@@ -531,10 +491,6 @@ body: |
; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; W32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; W32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
; W32-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll
new file mode 100644
index 0000000..6d0aa1e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+
+define protected amdgpu_kernel void @no_folding_imm_to_inst_with_fi(<4 x i64> %val4, <16 x i64> %val16) {
+; CHECK-LABEL: no_folding_imm_to_inst_with_fi:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: s_load_b256 s[36:43], s[4:5], 0x24
+; CHECK-NEXT: s_load_b512 s[16:31], s[4:5], 0xe4
+; CHECK-NEXT: s_load_b512 s[0:15], s[4:5], 0xa4
+; CHECK-NEXT: s_mov_b64 s[34:35], src_private_base
+; CHECK-NEXT: s_movk_i32 s33, 0x70
+; CHECK-NEXT: s_movk_i32 s34, 0x60
+; CHECK-NEXT: s_or_b32 s44, 0x80, s33
+; CHECK-NEXT: s_mov_b32 s45, s35
+; CHECK-NEXT: s_or_b32 s46, 0x80, s34
+; CHECK-NEXT: s_mov_b32 s47, s35
+; CHECK-NEXT: v_dual_mov_b32 v20, s44 :: v_dual_mov_b32 v21, s45
+; CHECK-NEXT: v_dual_mov_b32 v22, s46 :: v_dual_mov_b32 v23, s47
+; CHECK-NEXT: s_movk_i32 s34, 0x80
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: v_dual_mov_b32 v34, s34 :: v_dual_mov_b32 v35, s35
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v1, s41
+; CHECK-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
+; CHECK-NEXT: v_dual_mov_b32 v4, s36 :: v_dual_mov_b32 v5, s37
+; CHECK-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v7, s39
+; CHECK-NEXT: scratch_store_b128 off, v[0:3], off offset:16 scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21
+; CHECK-NEXT: s_movk_i32 s20, 0x50
+; CHECK-NEXT: v_dual_mov_b32 v8, s28 :: v_dual_mov_b32 v9, s29
+; CHECK-NEXT: v_dual_mov_b32 v10, s30 :: v_dual_mov_b32 v11, s31
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: s_or_b32 s20, 0x80, s20
+; CHECK-NEXT: s_mov_b32 s21, s35
+; CHECK-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; CHECK-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; CHECK-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s23
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: v_dual_mov_b32 v25, s21 :: v_dual_mov_b32 v24, s20
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], off scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[22:23], v[12:15] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[24:25], v[0:3] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17
+; CHECK-NEXT: s_or_b32 s16, 0x80, 64
+; CHECK-NEXT: s_mov_b32 s17, s35
+; CHECK-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
+; CHECK-NEXT: s_or_b32 s12, 0x80, 48
+; CHECK-NEXT: s_mov_b32 s13, s35
+; CHECK-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; CHECK-NEXT: s_or_b32 s8, 0x80, 32
+; CHECK-NEXT: s_mov_b32 s9, s35
+; CHECK-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5
+; CHECK-NEXT: s_or_b32 s4, 0x80, 16
+; CHECK-NEXT: s_mov_b32 s5, s35
+; CHECK-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: v_dual_mov_b32 v27, s17 :: v_dual_mov_b32 v26, s16
+; CHECK-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
+; CHECK-NEXT: v_dual_mov_b32 v29, s13 :: v_dual_mov_b32 v28, s12
+; CHECK-NEXT: v_dual_mov_b32 v31, s9 :: v_dual_mov_b32 v30, s8
+; CHECK-NEXT: v_dual_mov_b32 v33, s5 :: v_dual_mov_b32 v32, s4
+; CHECK-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; CHECK-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7
+; CHECK-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1
+; CHECK-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3
+; CHECK-NEXT: flat_store_b128 v[26:27], v[0:3] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[28:29], v[4:7] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[30:31], v[8:11] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[32:33], v[12:15] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[34:35], v[16:19] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[22:23] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[26:27] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[24:25] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[30:31] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[28:29] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[34:35] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[32:33] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: s_endpgm
+bb:
+ %alloca = alloca <4 x i64>, align 32, addrspace(5)
+ %alloca1 = alloca <16 x i64>, align 128, addrspace(5)
+ store volatile <4 x i64> %val4, ptr addrspace(5) %alloca
+ %ascast = addrspacecast ptr addrspace(5) %alloca1 to ptr
+ store volatile <16 x i64> %val16, ptr %ascast
+ %load = load volatile <16 x i64>, ptr %ascast
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
index d95fc77..a3c38b1 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
@@ -85,7 +85,7 @@ define i8 @flat_inst_valu_offset_1(ptr %p) {
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:1
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i8, ptr %p, i64 1
+ %gep = getelementptr inbounds i8, ptr %p, i64 1
%load = load i8, ptr %gep, align 4
ret i8 %load
}
@@ -160,7 +160,7 @@ define i8 @flat_inst_valu_offset_11bit_max(ptr %p) {
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:2047
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i8, ptr %p, i64 2047
+ %gep = getelementptr inbounds i8, ptr %p, i64 2047
%load = load i8, ptr %gep, align 4
ret i8 %load
}
@@ -235,7 +235,7 @@ define i8 @flat_inst_valu_offset_12bit_max(ptr %p) {
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:4095
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i8, ptr %p, i64 4095
+ %gep = getelementptr inbounds i8, ptr %p, i64 4095
%load = load i8, ptr %gep, align 4
ret i8 %load
}
@@ -330,7 +330,7 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:8191
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i8, ptr %p, i64 8191
+ %gep = getelementptr inbounds i8, ptr %p, i64 8191
%load = load i8, ptr %gep, align 4
ret i8 %load
}
@@ -425,7 +425,7 @@ define i8 @flat_inst_valu_offset_24bit_max(ptr %p) {
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:8388607
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i8, ptr %p, i64 8388607
+ %gep = getelementptr inbounds i8, ptr %p, i64 8388607
%load = load i8, ptr %gep, align 4
ret i8 %load
}
@@ -511,7 +511,7 @@ define i8 @flat_inst_valu_offset_neg_11bit_max(ptr %p) {
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-2048
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i8, ptr %p, i64 -2048
+ %gep = getelementptr inbounds i8, ptr %p, i64 -2048
%load = load i8, ptr %gep, align 4
ret i8 %load
}
@@ -597,7 +597,7 @@ define i8 @flat_inst_valu_offset_neg_12bit_max(ptr %p) {
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-4096
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i8, ptr %p, i64 -4096
+ %gep = getelementptr inbounds i8, ptr %p, i64 -4096
%load = load i8, ptr %gep, align 4
ret i8 %load
}
@@ -683,7 +683,7 @@ define i8 @flat_inst_valu_offset_neg_13bit_max(ptr %p) {
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-8192
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i8, ptr %p, i64 -8192
+ %gep = getelementptr inbounds i8, ptr %p, i64 -8192
%load = load i8, ptr %gep, align 4
ret i8 %load
}
@@ -769,7 +769,7 @@ define i8 @flat_inst_valu_offset_neg_24bit_max(ptr %p) {
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-8388608
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i8, ptr %p, i64 -8388608
+ %gep = getelementptr inbounds i8, ptr %p, i64 -8388608
%load = load i8, ptr %gep, align 4
ret i8 %load
}
@@ -845,7 +845,7 @@ define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) {
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:4095
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i8, ptr %p, i64 4095
+ %gep = getelementptr inbounds i8, ptr %p, i64 4095
%load = load i8, ptr %gep, align 4
ret i8 %load
}
@@ -940,7 +940,7 @@ define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) {
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:8191
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i8, ptr %p, i64 8191
+ %gep = getelementptr inbounds i8, ptr %p, i64 8191
%load = load i8, ptr %gep, align 4
ret i8 %load
}
@@ -1035,7 +1035,7 @@ define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) {
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:16383
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i8, ptr %p, i64 16383
+ %gep = getelementptr inbounds i8, ptr %p, i64 16383
%load = load i8, ptr %gep, align 4
ret i8 %load
}
@@ -1139,7 +1139,7 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) {
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i8, ptr %p, i64 16777214
+ %gep = getelementptr inbounds i8, ptr %p, i64 16777214
%load = load i8, ptr %gep, align 4
ret i8 %load
}
@@ -1225,7 +1225,7 @@ define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) {
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-4096
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i8, ptr %p, i64 -4096
+ %gep = getelementptr inbounds i8, ptr %p, i64 -4096
%load = load i8, ptr %gep, align 4
ret i8 %load
}
@@ -1311,7 +1311,7 @@ define i8 @flat_inst_valu_offset_2x_neg_12bit_max(ptr %p) {
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-8192
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i8, ptr %p, i64 -8192
+ %gep = getelementptr inbounds i8, ptr %p, i64 -8192
%load = load i8, ptr %gep, align 4
ret i8 %load
}
@@ -1397,7 +1397,7 @@ define i8 @flat_inst_valu_offset_2x_neg_13bit_max(ptr %p) {
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:-16384
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i8, ptr %p, i64 -16384
+ %gep = getelementptr inbounds i8, ptr %p, i64 -16384
%load = load i8, ptr %gep, align 4
ret i8 %load
}
@@ -2835,7 +2835,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX12-GISEL-NEXT: s_endpgm
- %gep = getelementptr i8, ptr %p, i64 1
+ %gep = getelementptr inbounds i8, ptr %p, i64 1
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
ret void
@@ -2925,7 +2925,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX12-GISEL-NEXT: s_endpgm
- %gep = getelementptr i8, ptr %p, i64 2047
+ %gep = getelementptr inbounds i8, ptr %p, i64 2047
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
ret void
@@ -3015,7 +3015,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX12-GISEL-NEXT: s_endpgm
- %gep = getelementptr i8, ptr %p, i64 4095
+ %gep = getelementptr inbounds i8, ptr %p, i64 4095
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
ret void
@@ -3127,7 +3127,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX12-GISEL-NEXT: s_endpgm
- %gep = getelementptr i8, ptr %p, i64 8191
+ %gep = getelementptr inbounds i8, ptr %p, i64 8191
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
ret void
@@ -3239,7 +3239,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX12-GISEL-NEXT: s_endpgm
- %gep = getelementptr i8, ptr %p, i64 -2048
+ %gep = getelementptr inbounds i8, ptr %p, i64 -2048
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
ret void
@@ -3351,7 +3351,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX12-GISEL-NEXT: s_endpgm
- %gep = getelementptr i8, ptr %p, i64 -4096
+ %gep = getelementptr inbounds i8, ptr %p, i64 -4096
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
ret void
@@ -3463,7 +3463,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX12-GISEL-NEXT: s_endpgm
- %gep = getelementptr i8, ptr %p, i64 -8192
+ %gep = getelementptr inbounds i8, ptr %p, i64 -8192
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
ret void
@@ -3553,7 +3553,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX12-GISEL-NEXT: s_endpgm
- %gep = getelementptr i8, ptr %p, i64 4095
+ %gep = getelementptr inbounds i8, ptr %p, i64 4095
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
ret void
@@ -3665,7 +3665,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX12-GISEL-NEXT: s_endpgm
- %gep = getelementptr i8, ptr %p, i64 8191
+ %gep = getelementptr inbounds i8, ptr %p, i64 8191
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
ret void
@@ -3777,7 +3777,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX12-GISEL-NEXT: s_endpgm
- %gep = getelementptr i8, ptr %p, i64 16383
+ %gep = getelementptr inbounds i8, ptr %p, i64 16383
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
ret void
@@ -3889,7 +3889,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX12-GISEL-NEXT: s_endpgm
- %gep = getelementptr i8, ptr %p, i64 -4096
+ %gep = getelementptr inbounds i8, ptr %p, i64 -4096
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
ret void
@@ -4001,7 +4001,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX12-GISEL-NEXT: s_endpgm
- %gep = getelementptr i8, ptr %p, i64 -8192
+ %gep = getelementptr inbounds i8, ptr %p, i64 -8192
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
ret void
@@ -4113,7 +4113,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0
; GFX12-GISEL-NEXT: s_endpgm
- %gep = getelementptr i8, ptr %p, i64 -16384
+ %gep = getelementptr inbounds i8, ptr %p, i64 -16384
%load = load volatile i8, ptr %gep, align 1
store i8 %load, ptr poison
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 42401af..f78168b 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -78,12 +78,14 @@ define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
; GFX1250-LABEL: fadd_v2_vs:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
-; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -142,13 +144,16 @@ define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[0:1]
-; GFX1250-SDAG-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[4:5]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[6:7]
+; GFX1250-SDAG-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v4_vs:
@@ -156,13 +161,16 @@ define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[0:1]
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[6:7]
+; GFX1250-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
@@ -332,56 +340,69 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
;
; GFX1250-SDAG-LABEL: fadd_v32_vs:
; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v32, s[0:1] offset:16
-; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v32, s[0:1]
-; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v32, s[0:1] offset:48
-; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v32, s[0:1] offset:32
-; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v32, s[0:1] offset:80
-; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v32, s[0:1] offset:64
-; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v32, s[0:1] offset:112
-; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v32, s[0:1] offset:96
-; GFX1250-SDAG-NEXT: s_clause 0x1
-; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xe4
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35]
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
+; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[12:13]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[14:15]
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s22 :: v_dual_mov_b32 v35, s23
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v39, s29
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s30 :: v_dual_mov_b32 v41, s31
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s24 :: v_dual_mov_b32 v37, s19
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s28 :: v_dual_mov_b32 v55, s15
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s3 :: v_dual_mov_b32 v52, s12
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s13 :: v_dual_mov_b32 v54, s14
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s7 :: v_dual_mov_b32 v50, s2
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s27 :: v_dual_mov_b32 v46, s4
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s5 :: v_dual_mov_b32 v48, s6
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s25 :: v_dual_mov_b32 v44, s26
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35]
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s8 :: v_dual_mov_b32 v33, s9
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s10 :: v_dual_mov_b32 v35, s11
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], s[10:11]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x4
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], s[16:17]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], s[40:41]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[40:41]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[38:39]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], s[38:39]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], s[48:49]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[34:35]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], s[44:45]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], s[46:47]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], s[50:51]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], s[36:37]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], s[42:43]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], s[18:19]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[8:9], v[8:9], s[20:21]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], s[22:23]
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], s[8:9]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[52:53]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[54:55]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], v[50:51]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], v[40:41]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[46:47]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[48:49]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[42:43]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[44:45]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[36:37]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[38:39]
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:96
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:112
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:80
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:48
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[4:7], s[0:1]
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35]
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v32_vs:
@@ -389,54 +410,70 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v32, s[34:35]
-; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v32, s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v32, s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v32, s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v32, s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v32, s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v32, s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v32, s[34:35] offset:112
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35]
+; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16
+; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
+; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48
+; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64
+; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80
+; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96
+; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[16:17]
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[18:19]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], s[20:21]
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], s[22:23]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[36:37]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[38:39]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], s[24:25]
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], s[26:27]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[40:41]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[42:43]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[12:13], v[12:13], s[28:29]
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[14:15], v[14:15], s[30:31]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[44:45]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[46:47]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[16:17], v[16:17], s[0:1]
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[18:19], v[18:19], s[2:3]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[16:17], v[16:17], v[48:49]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[18:19], v[18:19], v[50:51]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[20:21], v[20:21], s[4:5]
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[22:23], v[22:23], s[6:7]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[52:53]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[54:55]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[24:25], v[24:25], s[8:9]
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[26:27], v[26:27], s[10:11]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[24:25], v[24:25], v[32:33]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[26:27], v[26:27], v[34:35]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], s[12:13]
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], s[14:15]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[36:37]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[38:39]
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[0:3], s[34:35]
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[4:7], s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[8:11], s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[12:15], s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[16:19], s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[20:23], s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[24:27], s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[28:31], s[34:35] offset:112
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35]
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
@@ -502,15 +539,16 @@ define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) {
; GFX1250-GISEL-LABEL: fadd_v2_v_imm:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -645,15 +683,16 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
; GFX1250-GISEL-LABEL: fadd_v2_v_lit_splat:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1.0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -703,13 +742,15 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) {
; GFX1250-GISEL-LABEL: fadd_v2_v_lit_hi0:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], 0x3f800000
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -746,17 +787,31 @@ define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) {
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
;
-; GFX1250-LABEL: fadd_v2_v_lit_lo0:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1250-NEXT: s_mov_b64 s[2:3], lit64(0x3f80000000000000)
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
-; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
-; GFX1250-NEXT: s_endpgm
+; GFX1250-SDAG-LABEL: fadd_v2_v_lit_lo0:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x3f80000000000000)
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v2_v_lit_lo0:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x3f80000000000000)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -792,17 +847,31 @@ define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
;
-; GFX1250-LABEL: fadd_v2_v_unfoldable_lit:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1250-NEXT: s_mov_b64 s[2:3], lit64(0x400000003f800000)
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
-; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
-; GFX1250-NEXT: s_endpgm
+; GFX1250-SDAG-LABEL: fadd_v2_v_unfoldable_lit:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x400000003f800000)
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v2_v_unfoldable_lit:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x400000003f800000)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -1085,12 +1154,14 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, flo
; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_lo2:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] neg_lo:[0,1]
-; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] neg_lo:[0,1]
+; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_lo2:
@@ -1159,12 +1230,14 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, flo
; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_hi2:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1]
-; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1]
+; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_hi2:
@@ -1262,12 +1335,14 @@ define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
; GFX1250-LABEL: fmul_v2_vs:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
-; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -1326,13 +1401,16 @@ define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[2:3]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[0:1]
-; GFX1250-SDAG-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[4:5]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[6:7]
+; GFX1250-SDAG-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fmul_v4_vs:
@@ -1340,13 +1418,16 @@ define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[0:1]
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[4:5]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[6:7]
+; GFX1250-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
@@ -1516,56 +1597,69 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
;
; GFX1250-SDAG-LABEL: fmul_v32_vs:
; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v32, s[0:1] offset:16
-; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v32, s[0:1]
-; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v32, s[0:1] offset:48
-; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v32, s[0:1] offset:32
-; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v32, s[0:1] offset:80
-; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v32, s[0:1] offset:64
-; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v32, s[0:1] offset:112
-; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v32, s[0:1] offset:96
-; GFX1250-SDAG-NEXT: s_clause 0x1
-; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xe4
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35]
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
+; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[12:13]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[14:15]
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s22 :: v_dual_mov_b32 v35, s23
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v39, s29
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s30 :: v_dual_mov_b32 v41, s31
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s24 :: v_dual_mov_b32 v37, s19
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s28 :: v_dual_mov_b32 v55, s15
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s3 :: v_dual_mov_b32 v52, s12
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s13 :: v_dual_mov_b32 v54, s14
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s7 :: v_dual_mov_b32 v50, s2
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s27 :: v_dual_mov_b32 v46, s4
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s5 :: v_dual_mov_b32 v48, s6
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s25 :: v_dual_mov_b32 v44, s26
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35]
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s8 :: v_dual_mov_b32 v33, s9
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s10 :: v_dual_mov_b32 v35, s11
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], s[10:11]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x4
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], s[16:17]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], s[40:41]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[40:41]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[38:39]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[18:19], v[18:19], s[38:39]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[24:25], v[24:25], s[48:49]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[34:35]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], s[44:45]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], s[46:47]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], s[50:51]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], s[36:37]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], s[42:43]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], s[18:19]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[8:9], v[8:9], s[20:21]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], s[22:23]
-; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], s[8:9]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[52:53]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[54:55]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[50:51]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[40:41]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[46:47]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[48:49]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[42:43]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[44:45]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[36:37]
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[38:39]
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:96
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:112
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:80
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:48
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[4:7], s[0:1]
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35]
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fmul_v32_vs:
@@ -1573,54 +1667,70 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v32, s[34:35]
-; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v32, s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v32, s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v32, s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v32, s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v32, s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v32, s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v32, s[34:35] offset:112
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35]
+; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16
+; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
+; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48
+; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64
+; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80
+; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96
+; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[16:17]
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[18:19]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[4:5], v[4:5], s[20:21]
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[6:7], v[6:7], s[22:23]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[36:37]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[38:39]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[8:9], v[8:9], s[24:25]
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[10:11], v[10:11], s[26:27]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[40:41]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[42:43]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[12:13], v[12:13], s[28:29]
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[14:15], v[14:15], s[30:31]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[44:45]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[46:47]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[16:17], v[16:17], s[0:1]
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[18:19], v[18:19], s[2:3]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[48:49]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[50:51]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[20:21], v[20:21], s[4:5]
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[22:23], v[22:23], s[6:7]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[52:53]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[54:55]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[24:25], v[24:25], s[8:9]
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[26:27], v[26:27], s[10:11]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[32:33]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[34:35]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], s[12:13]
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], s[14:15]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[36:37]
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[38:39]
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[0:3], s[34:35]
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[4:7], s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[8:11], s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[12:15], s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[16:19], s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[20:23], s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[24:27], s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[28:31], s[34:35] offset:112
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35]
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
@@ -1685,15 +1795,16 @@ define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) {
; GFX1250-GISEL-LABEL: fmul_v2_v_imm:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -1828,15 +1939,16 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
; GFX1250-GISEL-LABEL: fmul_v2_v_lit_splat:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 4.0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -1873,17 +1985,31 @@ define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
;
-; GFX1250-LABEL: fmul_v2_v_unfoldable_lit:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1250-NEXT: s_mov_b64 s[2:3], lit64(0x4040000040800000)
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[2:3]
-; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
-; GFX1250-NEXT: s_endpgm
+; GFX1250-SDAG-LABEL: fmul_v2_v_unfoldable_lit:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x4040000040800000)
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fmul_v2_v_unfoldable_lit:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x4040000040800000)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -2040,12 +2166,14 @@ define amdgpu_kernel void @fma_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
; GFX1250-LABEL: fma_v2_vs:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[2:3]
-; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3]
+; GFX1250-NEXT: global_store_b64 v4, v[0:1], s[0:1] scale_offset
; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -2104,13 +2232,16 @@ define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
; GFX1250-SDAG-NEXT: s_clause 0x1
; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[2:3], s[2:3]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[0:1], s[0:1]
-; GFX1250-SDAG-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[4:5], v[4:5]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[6:7], v[6:7]
+; GFX1250-SDAG-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fma_v4_vs:
@@ -2118,13 +2249,16 @@ define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v8, s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[0:1], s[0:1]
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[4:5], v[4:5]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[6:7], v[6:7]
+; GFX1250-GISEL-NEXT: global_store_b128 v8, v[0:3], s[6:7] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
@@ -2294,56 +2428,68 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
;
; GFX1250-SDAG-LABEL: fma_v32_vs:
; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v32, s[0:1] offset:16
-; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v32, s[0:1]
-; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v32, s[0:1] offset:48
-; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v32, s[0:1] offset:32
-; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v32, s[0:1] offset:80
-; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v32, s[0:1] offset:64
-; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v32, s[0:1] offset:112
-; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v32, s[0:1] offset:96
-; GFX1250-SDAG-NEXT: s_clause 0x1
-; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xa4
-; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xe4
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35]
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
+; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[12:13], s[12:13]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[14:15], s[14:15]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[20:21]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[22:23]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[30:31]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[28:29]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[52:53], s[12:13]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[54:55], s[14:15]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[46:47], s[4:5]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[6:7]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[24:25]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[44:45], s[26:27]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[18:19]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[32:33], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], s[10:11], s[10:11]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x4
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], s[16:17], s[16:17]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], s[40:41], s[40:41]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[18:19], v[18:19], s[38:39], s[38:39]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[24:25], v[24:25], s[48:49], s[48:49]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[40:41], v[40:41]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[38:39], v[38:39]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17]
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], s[44:45], s[44:45]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[30:31], v[30:31], s[46:47], s[46:47]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], s[50:51], s[50:51]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[16:17], v[16:17], s[36:37], s[36:37]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], s[42:43], s[42:43]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], s[18:19], s[18:19]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[8:9], v[8:9], s[20:21], s[20:21]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], s[22:23], s[22:23]
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], s[8:9], s[8:9]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[52:53], v[52:53]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[32:33], v[32:33]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[22:23], v[22:23], v[34:35], v[34:35]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[54:55], v[54:55]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[26:27], v[26:27], v[50:51], v[50:51]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[24:25], v[24:25], v[40:41], v[40:41]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[16:17], v[16:17], v[46:47], v[46:47]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[18:19], v[18:19], v[48:49], v[48:49]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[8:9], v[8:9], v[42:43], v[42:43]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[44:45], v[44:45]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[36:37], v[36:37]
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[38:39], v[38:39]
; GFX1250-SDAG-NEXT: s_clause 0x7
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:96
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:112
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:80
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:48
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[4:7], s[0:1]
-; GFX1250-SDAG-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35]
+; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fma_v32_vs:
@@ -2351,54 +2497,70 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v32, s[34:35]
-; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v32, s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v32, s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v32, s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v32, s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v32, s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v32, s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v32, s[34:35] offset:112
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35]
+; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16
+; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32
+; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48
+; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64
+; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80
+; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96
+; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112
; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[16:17], s[16:17]
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[18:19], s[18:19]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[32:33], v[32:33]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[4:5], v[4:5], s[20:21], s[20:21]
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[6:7], v[6:7], s[22:23], s[22:23]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[36:37], v[36:37]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[38:39], v[38:39]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[8:9], v[8:9], s[24:25], s[24:25]
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[10:11], v[10:11], s[26:27], s[26:27]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[8:9], v[8:9], v[40:41], v[40:41]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[42:43], v[42:43]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[12:13], v[12:13], s[28:29], s[28:29]
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[14:15], v[14:15], s[30:31], s[30:31]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[44:45], v[44:45]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[46:47], v[46:47]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[16:17], v[16:17], s[0:1], s[0:1]
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[18:19], v[18:19], s[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[16:17], v[16:17], v[48:49], v[48:49]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[18:19], v[18:19], v[50:51], v[50:51]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[20:21], v[20:21], s[4:5], s[4:5]
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[22:23], v[22:23], s[6:7], s[6:7]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[52:53], v[52:53]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[22:23], v[22:23], v[54:55], v[54:55]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[24:25], v[24:25], s[8:9], s[8:9]
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[26:27], v[26:27], s[10:11], s[10:11]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[24:25], v[24:25], v[32:33], v[32:33]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[26:27], v[26:27], v[34:35], v[34:35]
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], s[12:13], s[12:13]
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], s[14:15], s[14:15]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[36:37], v[36:37]
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[38:39], v[38:39]
; GFX1250-GISEL-NEXT: s_clause 0x7
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[0:3], s[34:35]
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[4:7], s[34:35] offset:16
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[8:11], s[34:35] offset:32
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[12:15], s[34:35] offset:48
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[16:19], s[34:35] offset:64
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[20:23], s[34:35] offset:80
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[24:27], s[34:35] offset:96
-; GFX1250-GISEL-NEXT: global_store_b128 v32, v[28:31], s[34:35] offset:112
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35]
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96
+; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
@@ -2488,17 +2650,19 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) {
; GFX1250-GISEL-LABEL: fma_v2_v_imm:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0x43480000
; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
; GFX1250-GISEL-NEXT: s_mov_b32 s5, s4
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v6, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[4:5]
-; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX1250-GISEL-NEXT: global_store_b64 v6, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -2653,17 +2817,19 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
; GFX1250-GISEL-LABEL: fma_v2_v_lit_splat:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 4.0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1.0
; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
; GFX1250-GISEL-NEXT: s_mov_b32 s5, s4
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v6, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[4:5]
-; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX1250-GISEL-NEXT: global_store_b64 v6, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -2740,29 +2906,30 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
; GFX1250-SDAG-LABEL: fma_v2_v_unfoldable_lit:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[2:3], lit64(0x400000003f800000)
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: s_mov_b64 s[4:5], lit64(0x4040000040800000)
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v6, 0x3ff, v0
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x4040000040800000)
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[4:5], lit64(0x400000003f800000)
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v6, s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[4:5], s[2:3]
-; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX1250-SDAG-NEXT: global_store_b64 v6, v[0:1], s[0:1] scale_offset
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fma_v2_v_unfoldable_lit:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v0
; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x4040000040800000)
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_mov_b64 s[4:5], lit64(0x400000003f800000)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v6, s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[2:3], s[4:5]
-; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX1250-GISEL-NEXT: global_store_b64 v6, v[0:1], s[0:1] scale_offset
; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
@@ -3268,20 +3435,22 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
; GFX1250-SDAG-NEXT: s_add_f32 s1, s1, 0
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX1250-SDAG-NEXT: flat_store_b64 v[0:1], v[0:1]
+; GFX1250-SDAG-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_fadd_fsub_0:
; GFX1250-GISEL: ; %bb.0: ; %bb
; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], 0
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, v1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v0, v1
; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v3, v0
-; GFX1250-GISEL-NEXT: flat_store_b64 v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
bb:
%i12 = fadd <2 x float> zeroinitializer, %arg
@@ -3363,15 +3532,16 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX1250-SDAG-NEXT: s_add_f32 s6, s1, s3
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], s[2:3], s[6:7] op_sel_hi:[1,0]
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v0
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[2:3], s[2:3] neg_lo:[0,1] neg_hi:[0,1]
-; GFX1250-SDAG-NEXT: global_store_b64 v4, v[0:1], s[4:5]
+; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-SDAG-NEXT: s_add_f32 s2, s1, s3
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_3)
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[0:1], s[2:3] op_sel_hi:[1,0]
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, v2
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[4:5], v[0:1] neg_lo:[0,1] neg_hi:[0,1]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: fadd_fadd_fsub:
@@ -3380,13 +3550,16 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX1250-GISEL-NEXT: s_sub_f32 s0, s0, s2
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_3)
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v2, s0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], s[2:3], v[0:1]
-; GFX1250-GISEL-NEXT: v_dual_subrev_f32 v3, s3, v0 :: v_dual_mov_b32 v0, 0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, v1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[0:1]
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_subrev_f32 v3, s3, v0
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[4:5]
; GFX1250-GISEL-NEXT: s_endpgm
bb:
@@ -3593,7 +3766,9 @@ define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x
; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, s[2:3] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1]
; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX1250-GISEL-NEXT: s_endpgm
%fneg = fsub <2 x float> <float -0.0, float -0.0>, %x
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll
new file mode 100644
index 0000000..f934c85
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll
@@ -0,0 +1,208 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1250 <%s | FileCheck %s --check-prefixes=CHECK
+
+; CHECK-LABEL: {{^}}_amdgpu_cs_main:
+; CHECK: ; TotalNumSgprs: 4
+; CHECK: ; NumVgprs: 2
+; CHECK: .amdgpu_pal_metadata
+; CHECK-NEXT: ---
+; CHECK-NEXT: amdpal.pipelines:
+; CHECK-NEXT: - .api: Vulkan
+; CHECK-NEXT: .compute_registers:
+; CHECK-NEXT: .tg_size_en: true
+; CHECK-NEXT: .tgid_x_en: false
+; CHECK-NEXT: .tgid_y_en: false
+; CHECK-NEXT: .tgid_z_en: false
+; CHECK-NEXT: .tidig_comp_cnt: 0x1
+; CHECK-NEXT: .graphics_registers:
+; CHECK-NEXT: .ps_extra_lds_size: 0
+; CHECK-NEXT: .spi_ps_input_addr:
+; CHECK-NEXT: .ancillary_ena: false
+; CHECK-NEXT: .front_face_ena: true
+; CHECK-NEXT: .line_stipple_tex_ena: false
+; CHECK-NEXT: .linear_center_ena: true
+; CHECK-NEXT: .linear_centroid_ena: true
+; CHECK-NEXT: .linear_sample_ena: true
+; CHECK-NEXT: .persp_center_ena: true
+; CHECK-NEXT: .persp_centroid_ena: true
+; CHECK-NEXT: .persp_pull_model_ena: false
+; CHECK-NEXT: .persp_sample_ena: true
+; CHECK-NEXT: .pos_fixed_pt_ena: true
+; CHECK-NEXT: .pos_w_float_ena: false
+; CHECK-NEXT: .pos_x_float_ena: false
+; CHECK-NEXT: .pos_y_float_ena: false
+; CHECK-NEXT: .pos_z_float_ena: false
+; CHECK-NEXT: .sample_coverage_ena: false
+; CHECK-NEXT: .spi_ps_input_ena:
+; CHECK-NEXT: .ancillary_ena: false
+; CHECK-NEXT: .front_face_ena: false
+; CHECK-NEXT: .line_stipple_tex_ena: false
+; CHECK-NEXT: .linear_center_ena: false
+; CHECK-NEXT: .linear_centroid_ena: false
+; CHECK-NEXT: .linear_sample_ena: false
+; CHECK-NEXT: .persp_center_ena: false
+; CHECK-NEXT: .persp_centroid_ena: false
+; CHECK-NEXT: .persp_pull_model_ena: false
+; CHECK-NEXT: .persp_sample_ena: true
+; CHECK-NEXT: .pos_fixed_pt_ena: false
+; CHECK-NEXT: .pos_w_float_ena: false
+; CHECK-NEXT: .pos_x_float_ena: false
+; CHECK-NEXT: .pos_y_float_ena: false
+; CHECK-NEXT: .pos_z_float_ena: false
+; CHECK-NEXT: .sample_coverage_ena: false
+; CHECK-NEXT: .hardware_stages:
+; CHECK-NEXT: .cs:
+; CHECK-NEXT: .checksum_value: 0x9444d7d0
+; CHECK-NEXT: .debug_mode: false
+; CHECK-NEXT: .entry_point: _amdgpu_cs
+; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main
+; CHECK-NEXT: .excp_en: 0
+; CHECK-NEXT: .float_mode: 0xc0
+; CHECK-NEXT: .forward_progress: true
+; GFX11-NEXT: .ieee_mode: false
+; CHECK-NEXT: .image_op: false
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .mem_ordered: true
+; CHECK-NEXT: .scratch_en: false
+; CHECK-NEXT: .scratch_memory_size: 0
+; CHECK-NEXT: .sgpr_count: 0x4
+; CHECK-NEXT: .sgpr_limit: 0x6a
+; CHECK-NEXT: .threadgroup_dimensions:
+; CHECK-NEXT: - 0x1
+; CHECK-NEXT: - 0x400
+; CHECK-NEXT: - 0x1
+; CHECK-NEXT: .trap_present: false
+; CHECK-NEXT: .user_data_reg_map:
+; CHECK-NEXT: - 0x10000000
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: .user_sgprs: 0x3
+; CHECK-NEXT: .vgpr_count: 0x2
+; CHECK-NEXT: .vgpr_limit: 0x100
+; CHECK-NEXT: .wavefront_size: 0x20
+; CHECK-NEXT: .wgp_mode: false
+; CHECK-NEXT: .gs:
+; CHECK-NEXT: .debug_mode: false
+; CHECK-NEXT: .entry_point: _amdgpu_gs
+; CHECK-NEXT: .entry_point_symbol: gs_shader
+; CHECK-NEXT: .forward_progress: true
+; CHECK-NEXT: .lds_size: 0x400
+; CHECK-NEXT: .mem_ordered: true
+; CHECK-NEXT: .scratch_en: false
+; CHECK-NEXT: .scratch_memory_size: 0
+; CHECK-NEXT: .sgpr_count: 0x1
+; CHECK-NEXT: .vgpr_count: 0x1
+; CHECK-NEXT: .wgp_mode: false
+; CHECK-NEXT: .hs:
+; CHECK-NEXT: .debug_mode: false
+; CHECK-NEXT: .entry_point: _amdgpu_hs
+; CHECK-NEXT: .entry_point_symbol: hs_shader
+; CHECK-NEXT: .forward_progress: true
+; CHECK-NEXT: .lds_size: 0x1000
+; CHECK-NEXT: .mem_ordered: true
+; CHECK-NEXT: .scratch_en: false
+; CHECK-NEXT: .scratch_memory_size: 0
+; CHECK-NEXT: .sgpr_count: 0x1
+; CHECK-NEXT: .vgpr_count: 0x1
+; CHECK-NEXT: .wgp_mode: false
+; CHECK-NEXT: .ps:
+; CHECK-NEXT: .debug_mode: false
+; CHECK-NEXT: .entry_point: _amdgpu_ps
+; CHECK-NEXT: .entry_point_symbol: ps_shader
+; CHECK-NEXT: .forward_progress: true
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .mem_ordered: true
+; CHECK-NEXT: .scratch_en: false
+; CHECK-NEXT: .scratch_memory_size: 0
+; CHECK-NEXT: .sgpr_count: 0x1
+; CHECK-NEXT: .vgpr_count: 0x1
+; CHECK-NEXT: .wgp_mode: false
+; CHECK: .registers: {}
+; CHECK:amdpal.version:
+; CHECK-NEXT: - 0x3
+; CHECK-NEXT: - 0
+; CHECK-NEXT:...
+; CHECK-NEXT: .end_amdgpu_pal_metadata
+
+define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg1, i32 %arg2) #0 !lgc.shaderstage !1 {
+.entry:
+ %i = call i64 @llvm.amdgcn.s.getpc()
+ %i1 = and i64 %i, -4294967296
+ %i2 = zext i32 %arg1 to i64
+ %i3 = or i64 %i1, %i2
+ %i4 = inttoptr i64 %i3 to ptr addrspace(4)
+ %i5 = and i32 %arg2, 1023
+ %i6 = lshr i32 %arg2, 10
+ %i7 = and i32 %i6, 1023
+ %i8 = add nuw nsw i32 %i7, %i5
+ %i9 = load <4 x i32>, ptr addrspace(4) %i4, align 16
+ %.idx = shl nuw nsw i32 %i8, 2
+ call void @llvm.amdgcn.raw.buffer.store.i32(i32 1, <4 x i32> %i9, i32 %.idx, i32 0, i32 0)
+ ret void
+}
+
+define dllexport amdgpu_ps void @ps_shader() #1 {
+ ret void
+}
+
+@LDS.GS = external addrspace(3) global [1 x i32], align 4
+
+define dllexport amdgpu_gs void @gs_shader() #2 {
+ %ptr = getelementptr i32, ptr addrspace(3) @LDS.GS, i32 0
+ store i32 0, ptr addrspace(3) %ptr, align 4
+ ret void
+}
+
+@LDS.HS = external addrspace(3) global [1024 x i32], align 4
+
+define dllexport amdgpu_hs void @hs_shader() #2 {
+ %ptr = getelementptr i32, ptr addrspace(3) @LDS.HS, i32 0
+ store i32 0, ptr addrspace(3) %ptr, align 4
+ ret void
+}
+
+!amdgpu.pal.metadata.msgpack = !{!0}
+
+; Function Attrs: nounwind willreturn memory(none)
+declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i64 @llvm.amdgcn.s.getpc() #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
+declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg) #3
+
+attributes #0 = { nounwind memory(readwrite) "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="4" "denormal-fp-math-f32"="preserve-sign" }
+
+attributes #1 = { nounwind memory(readwrite) "InitialPSInputAddr"="36983" }
+
+!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size \B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
+!1 = !{i32 7}
diff --git a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
index fc36042..89bcfb3 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
@@ -58,6 +58,6 @@ declare i64 @llvm.amdgcn.s.getpc() #0
attributes #0 = { nounwind readnone speculatable willreturn }
;.
-; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" }
; ATTRIBUTOR_GCN: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index f54a383..dc631df 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -12,7 +12,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX908-NEXT: {{ $}}
; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32
- ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %25
+ ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %25
; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %25
; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %27
; REGALLOC-GFX908-NEXT: SI_SPILL_AV64_SAVE %27, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
@@ -37,7 +37,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX908-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; PEI-GFX908-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
- ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
+ ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1
; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
@@ -61,10 +61,10 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX90A-NEXT: {{ $}}
; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %24
- ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %24
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %22
- ; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY %22
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %23
+ ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %23
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3997706 /* regdef:VReg_64_Align2 */, def %21
+ ; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY %21
; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; REGALLOC-GFX90A-NEXT: [[COPY2:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
@@ -80,9 +80,9 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; PEI-GFX90A-NEXT: {{ $}}
; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
- ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
+ ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
- ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3
+ ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3997706 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1)
; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
index 764a1e1..21455a9 100644
--- a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
@@ -188,8 +188,7 @@ body: |
; GCN-LABEL: name: fold_sreg_64_to_sreg_64
; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1311768467750121200
- ; GCN-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
- ; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_MOV_B]]
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_MOV_B64_]]
%0:sreg_64 = S_MOV_B64 1311768467750121200
%1:sreg_64 = COPY killed %0
SI_RETURN_TO_EPILOG %1
@@ -265,7 +264,7 @@ body: |
; GCN-LABEL: name: fmac_sreg_64_sub0_src0_to_fmamk
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[V_FMAMK_F32_:%[0-9]+]]:vgpr_32 = V_FMAMK_F32 [[DEF]], 2882399984, [[DEF1]], implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_FMAMK_F32_:%[0-9]+]]:vgpr_32 = V_FMAMK_F32 [[DEF]], -1412567312, [[DEF1]], implicit $mode, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_FMAMK_F32_]]
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
@@ -319,7 +318,7 @@ body: |
; GCN-LABEL: name: fma_sreg_64_sub0_to_fmaak
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[V_FMAAK_F32_:%[0-9]+]]:vgpr_32 = V_FMAAK_F32 [[DEF]], [[DEF1]], 2882399984, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_FMAAK_F32_:%[0-9]+]]:vgpr_32 = V_FMAAK_F32 [[DEF]], [[DEF1]], -1412567312, implicit $mode, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_FMAAK_F32_]]
%0:vgpr_32 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
@@ -761,9 +760,106 @@ body: |
bb.0:
; GCN-LABEL: name: fold_av_mov_b32_imm_pseudo_inlineimm_to_av
; GCN: [[AV_MOV_:%[0-9]+]]:av_32 = AV_MOV_B32_IMM_PSEUDO 64, implicit $exec
- ; GCN-NEXT: [[COPY:%[0-9]+]]:av_32 = COPY killed [[AV_MOV_]]
- ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[COPY]]
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 64, implicit $exec
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B32_e32_]]
%0:av_32 = AV_MOV_B32_IMM_PSEUDO 64, implicit $exec
%1:av_32 = COPY killed %0
SI_RETURN_TO_EPILOG implicit %1
...
+
+---
+name: fold_av_mov_b64_imm_pseudo_inlineimm_to_vgpr
+body: |
+ bb.0:
+ ; GCN-LABEL: name: fold_av_mov_b64_imm_pseudo_inlineimm_to_vgpr
+ ; GCN: [[AV_MOV_:%[0-9]+]]:av_64_align2 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 64, implicit $exec
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]]
+ %0:av_64_align2 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
+ %1:vreg_64_align2 = COPY killed %0
+ SI_RETURN_TO_EPILOG implicit %1
+...
+
+---
+name: fold_av_mov_b64_imm_pseudo_inlineimm_to_agpr
+body: |
+ bb.0:
+ ; GCN-LABEL: name: fold_av_mov_b64_imm_pseudo_inlineimm_to_agpr
+ ; GCN: [[AV_MOV_:%[0-9]+]]:av_64_align2 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_64_align2 = COPY killed [[AV_MOV_]]
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[COPY]]
+ %0:av_64_align2 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
+ %1:areg_64_align2 = COPY killed %0
+ SI_RETURN_TO_EPILOG implicit %1
+...
+
+---
+name: fold_av_mov_b64_imm_pseudo_inlineimm_to_av
+body: |
+ bb.0:
+ ; GCN-LABEL: name: fold_av_mov_b64_imm_pseudo_inlineimm_to_av
+ ; GCN: [[AV_MOV_:%[0-9]+]]:av_64_align2 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 64, implicit $exec
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]]
+ %0:av_64_align2 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
+ %1:av_64_align2 = COPY killed %0
+ SI_RETURN_TO_EPILOG implicit %1
+...
+
+---
+name: fold_simm_16_sub_to_lo_from_mov_64_virt_sgpr16
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: fold_simm_16_sub_to_lo_from_mov_64_virt_sgpr16
+ ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_lo16 = COPY killed [[S_MOV_B64_]].lo16
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]]
+ %0:sreg_64 = S_MOV_B64 64
+ %1:sgpr_lo16 = COPY killed %0.lo16
+ SI_RETURN_TO_EPILOG %1
+
+...
+---
+name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_virt_sgpr16
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_virt_sgpr16
+ ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_lo16 = COPY killed [[S_MOV_B64_]].hi16
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]]
+ %0:sreg_64 = S_MOV_B64 64
+ %1:sgpr_lo16 = COPY killed %0.hi16
+ SI_RETURN_TO_EPILOG %1
+
+...
+
+---
+name: fold_simm_16_sub_to_lo_from_mov_64_phys_sgpr16_lo
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: fold_simm_16_sub_to_lo_from_mov_64_phys_sgpr16_lo
+ ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 64
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0_lo16
+ %0:sreg_64 = S_MOV_B64 64
+ $sgpr0_lo16 = COPY killed %0.lo16
+ SI_RETURN_TO_EPILOG $sgpr0_lo16
+
+...
+---
+name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_phys_sgpr16_lo
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_phys_sgpr16_lo
+ ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0_lo16
+ %0:sreg_64 = S_MOV_B64 64
+ $sgpr0_lo16 = COPY killed %0.hi16
+ SI_RETURN_TO_EPILOG $sgpr0_lo16
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir b/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir
index 8145a1d7..31ed09b 100644
--- a/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir
+++ b/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir
@@ -85,7 +85,6 @@ body: |
; GCN-NEXT: bb.3:
; GCN-NEXT: successors: %bb.2(0x80000000)
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; GCN-NEXT: S_BRANCH %bb.2
bb.0:
successors: %bb.1
@@ -142,7 +141,6 @@ body: |
; GCN-NEXT: bb.3:
; GCN-NEXT: successors: %bb.2(0x80000000)
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: S_BRANCH %bb.2
bb.0:
successors: %bb.1
diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
index 458afca..d0d5cc1 100644
--- a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
+++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir
@@ -10,13 +10,13 @@ body: |
; GCN-LABEL: name: bundle_memops
; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, implicit $exec
; GCN-NEXT: S_NOP 0
- ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit undef $vgpr3_vgpr4, implicit $exec {
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr1, implicit undef $vgpr3_vgpr4, implicit $exec {
; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, implicit $exec
; GCN-NEXT: $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, implicit $exec
; GCN-NEXT: }
; GCN-NEXT: S_NOP 0
; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, implicit $exec
- ; GCN-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit undef $vgpr0_vgpr1, implicit $exec, implicit undef $vgpr3_vgpr4 {
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr5, implicit undef $vgpr0_vgpr1, implicit $exec, implicit undef $vgpr3_vgpr4 {
; GCN-NEXT: $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 4, 0, implicit $exec
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, implicit $exec
; GCN-NEXT: $vgpr5 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, implicit $exec
@@ -35,7 +35,7 @@ body: |
; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, implicit $exec
; GCN-NEXT: S_NOP 0
; GCN-NEXT: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, implicit $exec
- ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit $vgpr0, implicit $exec, implicit $vgpr1 {
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit $vgpr0, implicit $exec, implicit $vgpr1 {
; GCN-NEXT: $vgpr2 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec
; GCN-NEXT: $vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec
; GCN-NEXT: }
@@ -44,11 +44,11 @@ body: |
; GCN-NEXT: DS_WRITE_B32_gfx9 $vgpr0, $vgpr3, 4, 0, implicit killed $m0, implicit $exec
; GCN-NEXT: }
; GCN-NEXT: S_NOP 0
- ; GCN-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr2_lo16, implicit-def $sgpr2_hi16, implicit-def $sgpr3, implicit-def $sgpr3_lo16, implicit-def $sgpr3_hi16, implicit undef $sgpr0_sgpr1, implicit undef $sgpr10 {
+ ; GCN-NEXT: BUNDLE implicit-def $sgpr2, implicit-def $sgpr3, implicit undef $sgpr0_sgpr1, implicit undef $sgpr10 {
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM undef $sgpr0_sgpr1, 0, 0
; GCN-NEXT: $sgpr3 = S_LOAD_DWORD_SGPR undef $sgpr0_sgpr1, undef $sgpr10, 0
; GCN-NEXT: }
- ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit $vgpr0, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr2, implicit $exec, implicit $vgpr1 {
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit $vgpr0, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr2, implicit $exec, implicit $vgpr1 {
; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, implicit $exec
; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, implicit $exec
; GCN-NEXT: }
@@ -56,7 +56,7 @@ body: |
; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
; GCN-NEXT: }
- ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
; GCN-NEXT: $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
; GCN-NEXT: $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32))
; GCN-NEXT: }
@@ -68,7 +68,7 @@ body: |
; GCN-NEXT: $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71 = S_LOAD_DWORDX8_IMM undef $sgpr10_sgpr11, 464, 0
; GCN-NEXT: $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75 = S_LOAD_DWORDX8_IMM undef $sgpr10_sgpr11, 128, 0
; GCN-NEXT: S_NOP 0
- ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit $vgpr0, implicit $exec, implicit $vgpr1 {
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit $vgpr0, implicit $exec, implicit $vgpr1 {
; GCN-NEXT: $vgpr2 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec
; GCN-NEXT: $vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec
; GCN-NEXT: }
@@ -124,7 +124,7 @@ body: |
; GCN-LABEL: name: bundle_dbg_value_0
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr1, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
; GCN-NEXT: DBG_VALUE $vgpr0, 0, 0
; GCN-NEXT: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
@@ -145,7 +145,7 @@ body: |
; GCN-LABEL: name: bundle_dbg_value_1
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6, $vgpr1
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr2, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
; GCN-NEXT: DBG_VALUE $vgpr0, 0, 0
; GCN-NEXT: DBG_VALUE $vgpr1, 0, 0
@@ -170,7 +170,7 @@ body: |
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6, $vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: DBG_VALUE $vgpr1, 0, 0
- ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr2, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
; GCN-NEXT: DBG_VALUE $vgpr0, 0, 0
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
@@ -193,7 +193,7 @@ body: |
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
; GCN-NEXT: {{ $}}
; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
- ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr1, implicit $vgpr5_vgpr6 {
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr2, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr1, implicit $vgpr5_vgpr6 {
; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
; GCN-NEXT: KILL $vgpr1
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
@@ -215,7 +215,7 @@ body: |
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
; GCN-NEXT: {{ $}}
; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
- ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr2, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
; GCN-NEXT: KILL internal $vgpr0
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
@@ -239,7 +239,7 @@ body: |
; GCN-LABEL: name: post_bundle_kill
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr1, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
; GCN-NEXT: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
; GCN-NEXT: }
@@ -258,7 +258,7 @@ body: |
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
; GCN-NEXT: {{ $}}
; GCN-NEXT: $vgpr7 = V_MOV_B32_e32 0, implicit $exec
- ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr1, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
; GCN-NEXT: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
; GCN-NEXT: }
@@ -280,7 +280,7 @@ body: |
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
; GCN-NEXT: {{ $}}
; GCN-NEXT: $vgpr7 = V_MOV_B32_e32 0, implicit $exec
- ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr1, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
; GCN-NEXT: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
; GCN-NEXT: }
@@ -300,7 +300,7 @@ body: |
; GCN-LABEL: name: post_bundle_multi_kill_0
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr1, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
; GCN-NEXT: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
; GCN-NEXT: }
@@ -320,7 +320,7 @@ body: |
; GCN-LABEL: name: post_bundle_multi_kill_1
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr1, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
; GCN-NEXT: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
; GCN-NEXT: }
@@ -341,7 +341,7 @@ body: |
; GCN-LABEL: name: post_bundle_kill_and_null_reg_dbginfo
; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr0, implicit-def $vgpr1, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 {
; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, implicit $exec
; GCN-NEXT: DBG_VALUE $noreg, $noreg
; GCN-NEXT: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
@@ -351,3 +351,58 @@ body: |
$vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, implicit $exec
KILL killed $vgpr3_vgpr4, killed $vgpr5_vgpr6
...
+
+# Do not bundle memory operations across a SCHED_BARRIER within an MBB.
+---
+name: no_sched_barrier_within_bundle
+tracksRegLiveness: true
+body: |
+ bb.0:
+    ; GCN-LABEL: name: no_sched_barrier_within_bundle
+ ; GCN: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+ ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+ ; GCN-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+ ; GCN-NEXT: renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2
+ ; GCN-NEXT: renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0
+ ; GCN-NEXT: }
+ ; GCN-NEXT: renamable $sgpr2_sgpr3 = IMPLICIT_DEF
+ ; GCN-NEXT: renamable $vgpr10 = IMPLICIT_DEF
+ ; GCN-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+ ; GCN-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+ ; GCN-NEXT: SCHED_BARRIER 1924
+ ; GCN-NEXT: renamable $vgpr11 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 0, 0, implicit $exec, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10
+ ; GCN-NEXT: SCHED_BARRIER 1924
+ ; GCN-NEXT: renamable $vgpr12 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 512, 0, implicit $exec, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10
+ ; GCN-NEXT: renamable $sgpr4_sgpr5 = IMPLICIT_DEF
+ ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+ ; GCN-NEXT: renamable $vgpr11 = nsw V_MUL_LO_U32_e64 killed $vgpr11, $vgpr11, implicit $exec
+ ; GCN-NEXT: renamable $vgpr12 = nsw V_MUL_LO_U32_e64 killed $vgpr12, $vgpr12, implicit $exec
+ ; GCN-NEXT: BUNDLE implicit killed $vgpr10, implicit killed $vgpr11, implicit killed $sgpr2_sgpr3, implicit $exec, implicit killed $vgpr12, implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr4_sgpr5, implicit killed $vgpr2 {
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR renamable $vgpr10, killed renamable $vgpr11, renamable $sgpr2_sgpr3, 0, 0, implicit $exec, implicit killed $vgpr11
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr10, killed renamable $vgpr12, killed renamable $sgpr2_sgpr3, 512, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr4_sgpr5, 0, 0, implicit $exec
+ ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5, 512, 0, implicit $exec
+ ; GCN-NEXT: }
+ ; GCN-NEXT: S_ENDPGM 0
+ renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+ renamable $vgpr0 = IMPLICIT_DEF
+ renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2
+ renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0
+ renamable $sgpr2_sgpr3 = IMPLICIT_DEF
+ renamable $vgpr10 = IMPLICIT_DEF
+ renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+ renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+ SCHED_BARRIER 1924
+ renamable $vgpr11 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 0, 0, implicit $exec, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10
+ SCHED_BARRIER 1924
+ renamable $vgpr12 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 512, 0, implicit $exec, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10
+ renamable $sgpr4_sgpr5 = IMPLICIT_DEF
+ renamable $vgpr0 = IMPLICIT_DEF
+ renamable $vgpr11 = nsw V_MUL_LO_U32_e64 killed $vgpr11, $vgpr11, implicit $exec
+ renamable $vgpr12 = nsw V_MUL_LO_U32_e64 killed $vgpr12, $vgpr12, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR renamable $vgpr10, killed renamable $vgpr11, renamable $sgpr2_sgpr3, 0, 0, implicit $exec, implicit killed $vgpr11
+ GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr10, killed renamable $vgpr12, killed renamable $sgpr2_sgpr3, 512, 0, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr4_sgpr5, 0, 0, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5, 512, 0, implicit $exec
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir
index 6ab8a343..5fea0ae 100644
--- a/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir
@@ -9,7 +9,7 @@ body: |
; GFX12-LABEL: name: post_bundle_vimage
; GFX12: liveins: $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
+ ; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec {
; GFX12-NEXT: $vgpr5 = IMAGE_LOAD_V1_V1_gfx12 $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
; GFX12-NEXT: $vgpr4 = IMAGE_LOAD_V1_V1_gfx12 killed $vgpr1, killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 1, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8)
; GFX12-NEXT: }
@@ -25,7 +25,7 @@ body: |
; GFX12-LABEL: name: post_bundle_vsample
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11
; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: BUNDLE implicit-def $vgpr6_vgpr7_vgpr8_vgpr9, implicit-def $vgpr6, implicit-def $vgpr6_lo16, implicit-def $vgpr6_hi16, implicit-def $vgpr7, implicit-def $vgpr7_lo16, implicit-def $vgpr7_hi16, implicit-def $vgpr8, implicit-def $vgpr8_lo16, implicit-def $vgpr8_hi16, implicit-def $vgpr9, implicit-def $vgpr9_lo16, implicit-def $vgpr9_hi16, implicit-def $vgpr6_vgpr7, implicit-def $vgpr6_vgpr7_vgpr8, implicit-def $vgpr7_vgpr8, implicit-def $vgpr7_vgpr8_vgpr9, implicit-def $vgpr8_vgpr9, implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr10_vgpr11, implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr11_vgpr12, implicit-def $vgpr11_vgpr12_vgpr13, implicit-def $vgpr12_vgpr13, implicit killed $vgpr0, implicit killed $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit killed $vgpr2, implicit killed $vgpr3 {
+ ; GFX12-NEXT: BUNDLE implicit-def $vgpr6_vgpr7_vgpr8_vgpr9, implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit killed $vgpr0, implicit killed $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit killed $vgpr2, implicit killed $vgpr3 {
; GFX12-NEXT: $vgpr6_vgpr7_vgpr8_vgpr9 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr0, killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
; GFX12-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr2, killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
; GFX12-NEXT: }
diff --git a/llvm/test/CodeGen/AMDGPU/pr155452.ll b/llvm/test/CodeGen/AMDGPU/pr155452.ll
new file mode 100644
index 0000000..d021b21
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/pr155452.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc %s -march=amdgcn -o - | FileCheck %s
+
+target triple = "amdgcn-amd-amdhsa"
+
+define amdgpu_kernel void @my_kernel(i64 %foo, i32 %bar) {
+; CHECK-LABEL: my_kernel:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-NEXT: s_add_i32 s12, s12, s17
+; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-NEXT: s_load_dword s0, s[8:9], 0x2
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
+; CHECK-NEXT: s_mov_b64 s[4:5], 1
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_ashr_i32 s6, s0, 31
+; CHECK-NEXT: s_abs_i32 s7, s0
+; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s7
+; CHECK-NEXT: s_sub_i32 s0, 0, s7
+; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
+; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
+; CHECK-NEXT: v_mul_lo_u32 v3, s0, v2
+; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: s_and_b64 s[0:1], exec, -1
+; CHECK-NEXT: .LBB0_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_mov_b32_e32 v3, s4
+; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[8:9], s2, v3, 1
+; CHECK-NEXT: s_mul_i32 s4, s3, s4
+; CHECK-NEXT: s_mul_i32 s5, s2, s5
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, s4, v4
+; CHECK-NEXT: v_readfirstlane_b32 s4, v3
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, s5, v4
+; CHECK-NEXT: s_ashr_i32 s5, s4, 31
+; CHECK-NEXT: s_abs_i32 s8, s4
+; CHECK-NEXT: s_xor_b32 s5, s5, s6
+; CHECK-NEXT: v_mul_hi_u32 v3, s8, v2
+; CHECK-NEXT: v_readfirstlane_b32 s9, v3
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v3
+; CHECK-NEXT: s_mul_i32 s9, s9, s7
+; CHECK-NEXT: s_sub_i32 s8, s8, s9
+; CHECK-NEXT: s_sub_i32 s9, s8, s7
+; CHECK-NEXT: s_cmp_ge_u32 s8, s7
+; CHECK-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; CHECK-NEXT: s_cselect_b32 s8, s9, s8
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v3
+; CHECK-NEXT: s_cmp_ge_u32 s8, s7
+; CHECK-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; CHECK-NEXT: v_xor_b32_e32 v3, s5, v3
+; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s5, v3
+; CHECK-NEXT: v_ashrrev_i32_e32 v5, 31, v3
+; CHECK-NEXT: v_or_b32_e32 v3, s4, v3
+; CHECK-NEXT: v_or_b32_e32 v4, v4, v5
+; CHECK-NEXT: flat_load_dwordx2 v[3:4], v[3:4]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[3:4]
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_mov_b64 vcc, s[0:1]
+; CHECK-NEXT: s_cbranch_vccnz .LBB0_1
+; CHECK-NEXT: ; %bb.2: ; %DummyReturnBlock
+; CHECK-NEXT: s_endpgm
+entry:
+ br label %loop
+
+loop: ; preds = %entry, %loop
+ %i = phi i64 [ 1, %entry ], [ 0, %loop ]
+ %mul = mul i64 %foo, %i
+ %add = add i64 %mul, 1
+ %trunc = trunc i64 %add to i32
+ %div = sdiv i32 %trunc, %bar
+ %sext = sext i32 %div to i64
+ %or = or i64 %add, %sext
+ %inttoptr = inttoptr i64 %or to ptr
+ %addrspacecast = addrspacecast ptr %inttoptr to ptr addrspace(1)
+ %val = load double, ptr addrspace(1) %addrspacecast, align 8
+ store double %val, ptr addrspace(1) null, align 8
+ br label %loop
+}
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
index 79b531e..f5e136a 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s
define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 {
; GFX942-LABEL: preload_block_count_x:
@@ -30,6 +31,12 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0
; GFX90a-NEXT: v_mov_b32_e32 v1, s10
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -65,6 +72,12 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inr
; GFX90a-NEXT: v_mov_b32_e32 v1, s12
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_unused_arg_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -101,6 +114,12 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[14:15]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: no_free_sgprs_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s18
+; GFX1250-NEXT: global_store_b32 v0, v1, s[8:9]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -127,6 +146,14 @@ define amdgpu_kernel void @no_inreg_block_count_x(ptr addrspace(1) %out) #0 {
; GFX90a-NEXT: v_mov_b32_e32 v1, s2
; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: no_inreg_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -156,6 +183,16 @@ define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32
; GFX90a-NEXT: v_mov_b32_e32 v1, s2
; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: mixed_inreg_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b32 s2, s[0:1], 0x10
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -172,10 +209,10 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB5_0:
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: incorrect_type_i64_block_count_x:
@@ -187,11 +224,20 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB5_0:
; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-NEXT: v_mov_b32_e32 v2, 0
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90a-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: incorrect_type_i64_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i64, ptr addrspace(4) %imp_arg_ptr
store i64 %load, ptr addrspace(1) %out
@@ -228,6 +274,14 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: incorrect_type_i16_block_count_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] offset:8
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i16, ptr addrspace(4) %imp_arg_ptr
store i16 %load, ptr addrspace(1) %out
@@ -261,6 +315,12 @@ define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0
; GFX90a-NEXT: v_mov_b32_e32 v1, s11
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_block_count_y:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
%load = load i32, ptr addrspace(4) %gep
@@ -300,6 +360,14 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: random_incorrect_offset:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0xa
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2
%load = load i32, ptr addrspace(4) %gep
@@ -336,6 +404,12 @@ define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0
; GFX90a-NEXT: v_mov_b32_e32 v1, s12
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_block_count_z:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
%load = load i32, ptr addrspace(4) %gep
@@ -376,6 +450,15 @@ define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspa
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_block_count_x_imparg_align_ptr_i8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s0, s4, 0xff
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_add_co_i32 s0, s6, s0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
%ext = zext i8 %val to i32
@@ -417,6 +500,13 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out)
; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_block_count_xyz:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
+; GFX1250-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0
%load_x = load i32, ptr addrspace(4) %gep_x
@@ -461,6 +551,14 @@ define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_workgroup_size_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s0, s7, 0xffff
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
%load = load i16, ptr addrspace(4) %gep
@@ -499,6 +597,14 @@ define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_workgroup_size_y:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_lshr_b32 s0, s7, 16
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
%load = load i16, ptr addrspace(4) %gep
@@ -539,6 +645,14 @@ define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_workgroup_size_z:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
%load = load i16, ptr addrspace(4) %gep
@@ -587,6 +701,16 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou
; GFX90a-NEXT: v_mov_b32_e32 v2, s2
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_workgroup_size_xyz:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_lshr_b32 s0, s7, 16
+; GFX1250-NEXT: s_and_b32 s1, s7, 0xffff
+; GFX1250-NEXT: s_and_b32 s4, s8, 0xffff
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s1
+; GFX1250-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
%load_x = load i16, ptr addrspace(4) %gep_x
@@ -636,6 +760,14 @@ define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 {
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_remainder_x:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_lshr_b32 s0, s8, 16
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
%load = load i16, ptr addrspace(4) %gep
@@ -674,6 +806,14 @@ define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 {
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preloadremainder_y:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s0, s9, 0xffff
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
%load = load i16, ptr addrspace(4) %gep
@@ -712,6 +852,14 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 {
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preloadremainder_z:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_lshr_b32 s0, s9, 16
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
%load = load i16, ptr addrspace(4) %gep
@@ -758,6 +906,16 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0
; GFX90a-NEXT: v_mov_b32_e32 v2, s0
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preloadremainder_xyz:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_lshr_b32 s0, s9, 16
+; GFX1250-NEXT: s_lshr_b32 s1, s8, 16
+; GFX1250-NEXT: s_and_b32 s4, s9, 0xffff
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s1
+; GFX1250-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
%load_x = load i16, ptr addrspace(4) %gep_x
@@ -805,6 +963,14 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[14:15]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: no_free_sgprs_preloadremainder_z:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_lshr_b32 s0, s15, 16
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[8:9]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
%load = load i16, ptr addrspace(4) %gep
@@ -845,6 +1011,12 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_block_max_user_sgprs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s12
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
store i32 %load, ptr addrspace(1) %out
@@ -887,6 +1059,15 @@ define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(pt
; GFX90a-NEXT: v_mov_b32_e32 v2, s0
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_lshr_b32 s0, s9, 16
+; GFX1250-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s6
+; GFX1250-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[2:3]
+; GFX1250-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
%gep1 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
index 58f0b96..84aa948 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
@@ -13,7 +13,8 @@ define amdgpu_kernel void @preload_ptr_kernarg_header(ptr inreg %arg) {
; ASM-NEXT: .p2align 8
; ASM-NEXT: .LBB0_0:
; ASM-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
-; ASM-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
+; ASM-NEXT: v_mov_b64_e32 v[2:3], s[8:9]
+; ASM-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; ASM-NEXT: s_endpgm
store ptr %arg, ptr %arg
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index efe4cfa..4d367ef 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942 %s
-
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s
define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0) #0 {
; GFX942-LABEL: ptr1_i8:
@@ -33,6 +33,14 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ptr1_i8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s0, s4, 0xff
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
ret void
@@ -68,6 +76,14 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zero
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ptr1_i8_zext_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s0, s4, 0xff
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
@@ -103,6 +119,14 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) inreg %out, i16
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ptr1_i16_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s0, s4, 0xffff
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
ret void
@@ -136,6 +160,12 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) inreg %out, i32
; GFX90a-NEXT: v_mov_b32_e32 v1, s10
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ptr1_i32_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
store i32 %arg0, ptr addrspace(1) %out
ret void
}
@@ -172,6 +202,14 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 inreg %arg0, ptr addrspa
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[10:11]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: i32_ptr1_i32_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_add_co_i32 s0, s2, s6
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX1250-NEXT: s_endpgm
%add = add i32 %arg0, %arg1
store i32 %add, ptr addrspace(1) %out
ret void
@@ -211,6 +249,16 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out,
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ptr1_i16_i16_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_lshr_b32 s0, s4, 16
+; GFX1250-NEXT: s_and_b32 s1, s4, 0xffff
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_add_co_i32 s0, s1, s0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
%ext1 = zext i16 %arg1 to i32
%add = add i32 %ext, %ext1
@@ -246,6 +294,12 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2
; GFX90a-NEXT: v_mov_b32_e32 v1, s10
; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ptr1_v2i8_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
store <2 x i8> %in, ptr addrspace(1) %out
ret void
}
@@ -289,6 +343,18 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad
; GFX90a-NEXT: global_store_dword v0, v2, s[8:9]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: byref_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x100
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
store volatile i32 %in, ptr addrspace(1) %out, align 4
store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
@@ -335,6 +401,18 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o
; GFX90a-NEXT: global_store_dword v0, v2, s[8:9]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: byref_staggered_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x100
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
store volatile i32 %in, ptr addrspace(1) %out, align 4
store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
@@ -390,6 +468,18 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture inreg %out, <8 x
; GFX90a-NEXT: v_mov_b32_e32 v3, s15
; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v8i32_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v0, s14
+; GFX1250-NEXT: v_dual_mov_b32 v1, s15 :: v_dual_mov_b32 v2, s16
+; GFX1250-NEXT: v_dual_mov_b32 v3, s17 :: v_dual_mov_b32 v4, s10
+; GFX1250-NEXT: v_dual_mov_b32 v5, s11 :: v_dual_mov_b32 v6, s12
+; GFX1250-NEXT: v_mov_b32_e32 v7, s13
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[2:3] offset:16
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[2:3]
+; GFX1250-NEXT: s_endpgm
store <8 x i32> %in, ptr addrspace(1) %out, align 4
ret void
}
@@ -425,6 +515,15 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture inreg %o
; GFX90a-NEXT: v_mov_b32_e32 v1, s10
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v3i16_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: v_mov_b32_e32 v2, s4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3] offset:4
+; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3]
+; GFX1250-NEXT: s_endpgm
store <3 x i16> %in, ptr addrspace(1) %out, align 4
ret void
}
@@ -461,6 +560,13 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture inreg %o
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v3i32_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1250-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v3, 0
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[2:3]
+; GFX1250-NEXT: s_endpgm
store <3 x i32> %in, ptr addrspace(1) %out, align 4
ret void
}
@@ -497,6 +603,13 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %o
; GFX90a-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v3f32_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s6
+; GFX1250-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v2, s8
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[2:3]
+; GFX1250-NEXT: s_endpgm
store <3 x float> %in, ptr addrspace(1) %out, align 4
ret void
}
@@ -546,6 +659,19 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %ou
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v5i8_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_pack_lh_b32_b16 s0, 0, s4
+; GFX1250-NEXT: s_and_b32 s1, s4, 0xffff
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: s_or_b32 s0, s1, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b8 v0, v1, s[2:3] offset:4
+; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3]
+; GFX1250-NEXT: s_endpgm
store <5 x i8> %in, ptr addrspace(1) %out, align 4
ret void
}
@@ -604,6 +730,24 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x
; GFX90a-NEXT: v_mov_b32_e32 v3, s15
; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v5f64_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b64 s[12:13], s[0:1], 0x60
+; GFX1250-NEXT: s_load_b256 s[4:11], s[0:1], 0x40
+; GFX1250-NEXT: v_mov_b32_e32 v10, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GFX1250-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX1250-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
+; GFX1250-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX1250-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: global_store_b64 v10, v[8:9], s[2:3] offset:32
+; GFX1250-NEXT: global_store_b128 v10, v[0:3], s[2:3] offset:16
+; GFX1250-NEXT: global_store_b128 v10, v[4:7], s[2:3]
+; GFX1250-NEXT: s_endpgm
store <5 x double> %in, ptr addrspace(1) %out, align 8
ret void
}
@@ -665,6 +809,20 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v8i8_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_pack_lh_b32_b16 s0, 0, s5
+; GFX1250-NEXT: s_pack_lh_b32_b16 s1, 0, s4
+; GFX1250-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX1250-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX1250-NEXT: s_or_b32 s1, s4, s1
+; GFX1250-NEXT: s_or_b32 s0, s5, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-NEXT: s_endpgm
store <8 x i8> %in, ptr addrspace(1) %out
ret void
}
@@ -679,9 +837,9 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) inreg %out, i
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB16_0:
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: i64_kernel_preload_arg:
@@ -692,10 +850,17 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) inreg %out, i
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB16_0:
-; GFX90a-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90a-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: i64_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-NEXT: s_endpgm
store i64 %a, ptr addrspace(1) %out, align 8
ret void
}
@@ -710,9 +875,9 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) inreg %out, d
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB17_0:
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX90a-LABEL: f64_kernel_preload_arg:
@@ -723,10 +888,17 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) inreg %out, d
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB17_0:
-; GFX90a-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v0, 0
+; GFX90a-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90a-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: f64_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX1250-NEXT: s_endpgm
store double %in, ptr addrspace(1) %out
ret void
}
@@ -759,6 +931,12 @@ define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) inreg %out,
; GFX90a-NEXT: v_mov_b32_e32 v1, s10
; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: half_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
store half %in, ptr addrspace(1) %out
ret void
}
@@ -791,6 +969,12 @@ define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out
; GFX90a-NEXT: v_mov_b32_e32 v1, s10
; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: bfloat_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
store bfloat %in, ptr addrspace(1) %out
ret void
}
@@ -823,6 +1007,12 @@ define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
; GFX90a-NEXT: v_mov_b32_e32 v1, s10
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v2bfloat_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
store <2 x bfloat> %in, ptr addrspace(1) %out
ret void
}
@@ -858,6 +1048,15 @@ define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
; GFX90a-NEXT: v_mov_b32_e32 v1, s10
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v3bfloat_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: v_mov_b32_e32 v2, s4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3] offset:4
+; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3]
+; GFX1250-NEXT: s_endpgm
store <3 x bfloat> %in, ptr addrspace(1) %out
ret void
}
@@ -894,6 +1093,13 @@ define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v6bfloat_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1250-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v3, 0
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[2:3]
+; GFX1250-NEXT: s_endpgm
store <6 x bfloat> %in, ptr addrspace(1) %out
ret void
}
@@ -939,6 +1145,17 @@ define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inr
; GFX90a-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v4, s4
+; GFX1250-NEXT: v_dual_mov_b32 v5, s9 :: v_dual_mov_b32 v2, s8
+; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: global_store_b16 v3, v4, s[2:3]
+; GFX1250-NEXT: global_store_b16 v3, v5, s[10:11] offset:12
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[10:11]
+; GFX1250-NEXT: s_endpgm
store half %in, ptr addrspace(1) %out
store <7 x bfloat> %in2, ptr addrspace(1) %out2
ret void
@@ -974,6 +1191,14 @@ define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) inreg %out, i1
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_byte v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: i1_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s0, s4, 1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b8 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
store i1 %in, ptr addrspace(1) %out
ret void
}
@@ -1012,6 +1237,14 @@ define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out,
; GFX90a-NEXT: v_mov_b32_e32 v3, s15
; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fp128_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v0, s6
+; GFX1250-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v2, s8
+; GFX1250-NEXT: v_mov_b32_e32 v3, s9
+; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[2:3]
+; GFX1250-NEXT: s_endpgm
store fp128 %in, ptr addrspace(1) %out
ret void
}
@@ -1063,6 +1296,20 @@ define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out,
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v7i8_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_pack_lh_b32_b16 s0, 0, s4
+; GFX1250-NEXT: s_and_b32 s1, s4, 0xffff
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: s_or_b32 s0, s1, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: global_store_d16_hi_b8 v0, v1, s[2:3] offset:6
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3] offset:4
+; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3]
+; GFX1250-NEXT: s_endpgm
store <7 x i8> %in, ptr addrspace(1) %out
ret void
}
@@ -1103,6 +1350,16 @@ define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) inreg %out
; GFX90a-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: v7half_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v4, s9
+; GFX1250-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v0, s6
+; GFX1250-NEXT: v_mov_b32_e32 v1, s7
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b16 v3, v4, s[2:3] offset:12
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[2:3]
+; GFX1250-NEXT: s_endpgm
store <7 x half> %in, ptr addrspace(1) %out
ret void
}
@@ -1139,6 +1396,15 @@ define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) inreg %ou
; GFX90a-NEXT: v_mov_b32_e32 v1, s11
; GFX90a-NEXT: global_store_dword v0, v1, s[12:13]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: i16_i32_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: v_mov_b32_e32 v2, s5
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3]
+; GFX1250-NEXT: global_store_b32 v0, v2, s[6:7]
+; GFX1250-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store i32 %in2, ptr addrspace(1) %out2
ret void
@@ -1181,6 +1447,16 @@ define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg %
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v4, s4
+; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX1250-NEXT: v_mov_b32_e32 v2, s8
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b16 v3, v4, s[2:3]
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[10:11]
+; GFX1250-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store <3 x i32> %in2, ptr addrspace(1) %out2
ret void
@@ -1216,6 +1492,14 @@ define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) inreg %ou
; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: global_store_short_d16_hi v0, v1, s[12:13]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: i16_i16_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3]
+; GFX1250-NEXT: global_store_d16_hi_b16 v0, v1, s[6:7]
+; GFX1250-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store i16 %in2, ptr addrspace(1) %out2
ret void
@@ -1261,6 +1545,14 @@ define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %o
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_short v0, v1, s[12:13]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b16 v0, v1, s[2:3]
+; GFX1250-NEXT: global_store_d16_hi_b16 v0, v1, s[6:7]
+; GFX1250-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store <2 x i8> %in2, ptr addrspace(1) %out2
ret void
@@ -1302,6 +1594,16 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p
; GFX90a-NEXT: v_mov_b32_e32 v1, s2
; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: i32_ptr1_i32_staggered_preload_arg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b96 s[4:6], s[0:1], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_add_co_i32 s0, s2, s6
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX1250-NEXT: s_endpgm
%add = add i32 %arg0, %arg1
store i32 %add, ptr addrspace(1) %out
ret void
@@ -1336,6 +1638,14 @@ define amdgpu_kernel void @ptr1_i8_trailing_unused(ptr addrspace(1) inreg %out,
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
+;
+; GFX1250-LABEL: ptr1_i8_trailing_unused:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_and_b32 s0, s4, 0xff
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX1250-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
index 3ce0947..79910af 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
@@ -374,7 +374,7 @@ define i32 @shl_i16_zext_i32(i16 %x, i16 %y) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: shl_i16_zext_i32:
@@ -412,7 +412,7 @@ define i32 @lshr_i16_zext_i32(i16 %x, i16 %y) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: lshr_i16_zext_i32:
@@ -450,7 +450,7 @@ define i32 @ashr_i16_zext_i32(i16 %x, i16 %y) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.l, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: ashr_i16_zext_i32:
@@ -488,7 +488,7 @@ define i32 @add_u16_zext_i32(i16 %x, i16 %y) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: add_u16_zext_i32:
@@ -526,7 +526,7 @@ define i32 @sub_u16_zext_i32(i16 %x, i16 %y) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: sub_u16_zext_i32:
@@ -564,7 +564,7 @@ define i32 @mul_lo_u16_zext_i32(i16 %x, i16 %y) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: mul_lo_u16_zext_i32:
@@ -602,7 +602,7 @@ define i32 @min_u16_zext_i32(i16 %x, i16 %y) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: min_u16_zext_i32:
@@ -641,7 +641,7 @@ define i32 @min_i16_zext_i32(i16 %x, i16 %y) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: min_i16_zext_i32:
@@ -680,7 +680,7 @@ define i32 @max_u16_zext_i32(i16 %x, i16 %y) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: max_u16_zext_i32:
@@ -719,7 +719,7 @@ define i32 @max_i16_zext_i32(i16 %x, i16 %y) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: max_i16_zext_i32:
@@ -758,7 +758,7 @@ define i32 @zext_fadd_f16(half %x, half %y) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: zext_fadd_f16:
@@ -797,8 +797,10 @@ define i32 @zext_fma_f16(half %x, half %y, half %z) {
; GFX11-TRUE16-LABEL: zext_fma_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v0.h, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: zext_fma_f16:
@@ -838,7 +840,7 @@ define i32 @zext_div_fixup_f16(half %x, half %y, half %z) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v1.l, v2.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: zext_div_fixup_f16:
@@ -880,7 +882,7 @@ define i32 @zext_fptrunc_f16(float %x) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: zext_fptrunc_f16:
@@ -924,12 +926,20 @@ define i32 @zext_fptrunc_fma_f16(float %x, float %y, float %z) {
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: zext_fptrunc_fma_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: zext_fptrunc_fma_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: zext_fptrunc_fma_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fma = call float @llvm.fma.f32(float %x, float %y, float %z)
%fptrunc = fptrunc float %fma to half
%cast = bitcast half %fptrunc to i16
@@ -940,3 +950,5 @@ define i32 @zext_fptrunc_fma_f16(float %x, float %y, float %z) {
declare half @llvm.amdgcn.div.fixup.f16(half, half, half)
declare half @llvm.fma.f16(half, half, half)
declare float @llvm.fma.f32(float, float, float)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll
index ad42748..c1123d7 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-max-regs.ll
@@ -1,9 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck --check-prefix=BASE --check-prefix=DEFAULT %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=16 < %s | FileCheck --check-prefix=BASE --check-prefix=MAX16 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=24 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX24
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 -amdgpu-promote-alloca-to-vector-max-regs=32 < %s | FileCheck --check-prefix=BASE %s --check-prefix=MAX32
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds=1 < %s | FileCheck --check-prefix=BASE %s --check-prefix=DEFAULT

define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
+; MAX16-LABEL: define amdgpu_kernel void @i32_24_elements(
+; MAX16-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; MAX16-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX16-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX16-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX16-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX16-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX16-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX16-NEXT: [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5)
+; MAX16-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
+; MAX16-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; MAX16-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
+; MAX16-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; MAX16-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; MAX16-NEXT: [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; MAX16-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; MAX16-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4
+; MAX16-NEXT: ret void
+;
+; MAX24-LABEL: define amdgpu_kernel void @i32_24_elements(
+; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; MAX24-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX24-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX24-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX24-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX24-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX24-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX24-NEXT: [[ALLOCA:%.*]] = freeze <24 x i32> poison
+; MAX24-NEXT: [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
+; MAX24-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4
+; MAX24-NEXT: ret void
+;
; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements(
; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
; DEFAULT-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -19,12 +51,50 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
; DEFAULT-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4
; DEFAULT-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4
; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
-; DEFAULT-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4
+; DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; DEFAULT-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4
; DEFAULT-NEXT: ret void
;
-; MAX24-LABEL: define amdgpu_kernel void @i32_24_elements(
-; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+ %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+ %c1 = icmp uge i32 %x, 3
+ %c2 = icmp uge i32 %y, 3
+ %sel1 = select i1 %c1, i32 1, i32 2
+ %sel2 = select i1 %c2, i32 0, i32 %sel1
+ %alloca = alloca [24 x i32], align 16, addrspace(5)
+ call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false)
+ %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+ %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20
+ store i32 42, ptr addrspace(5) %gep.0
+ store i32 43, ptr addrspace(5) %gep.1
+ %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+ %load = load i32, ptr addrspace(5) %gep
+ store i32 %load, ptr %out
+ ret void
+}
+
+define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 {
+; MAX16-LABEL: define amdgpu_kernel void @i32_24_elements_attrib(
+; MAX16-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; MAX16-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX16-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX16-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX16-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX16-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX16-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX16-NEXT: [[ALLOCA:%.*]] = alloca [24 x i32], align 16, addrspace(5)
+; MAX16-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
+; MAX16-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; MAX16-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
+; MAX16-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; MAX16-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; MAX16-NEXT: [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; MAX16-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; MAX16-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4
+; MAX16-NEXT: ret void
+;
+; MAX24-LABEL: define amdgpu_kernel void @i32_24_elements_attrib(
+; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
; MAX24-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; MAX24-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
; MAX24-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3
@@ -36,18 +106,18 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
; MAX24-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4
; MAX24-NEXT: ret void
;
-; MAX32-LABEL: define amdgpu_kernel void @i32_24_elements(
-; MAX32-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
-; MAX32-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; MAX32-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
-; MAX32-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3
-; MAX32-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3
-; MAX32-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
-; MAX32-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
-; MAX32-NEXT: [[ALLOCA:%.*]] = freeze <24 x i32> poison
-; MAX32-NEXT: [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
-; MAX32-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4
-; MAX32-NEXT: ret void
+; DEFAULT-LABEL: define amdgpu_kernel void @i32_24_elements_attrib(
+; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+; DEFAULT-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; DEFAULT-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; DEFAULT-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3
+; DEFAULT-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; DEFAULT-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; DEFAULT-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; DEFAULT-NEXT: [[ALLOCA:%.*]] = freeze <24 x i32> poison
+; DEFAULT-NEXT: [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
+; DEFAULT-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4
+; DEFAULT-NEXT: ret void
;
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
%y = tail call i32 @llvm.amdgcn.workitem.id.y()
@@ -67,18 +137,24 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
ret void
}

-define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 {
-; BASE-LABEL: define amdgpu_kernel void @i32_24_elements_attrib(
-; BASE-SAME: ptr [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
+define amdgpu_kernel void @i32_32_elements(ptr %out) #0 {
+; BASE-LABEL: define amdgpu_kernel void @i32_32_elements(
+; BASE-SAME: ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
; BASE-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; BASE-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
; BASE-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3
; BASE-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3
; BASE-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
; BASE-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
-; BASE-NEXT: [[ALLOCA:%.*]] = freeze <24 x i32> poison
-; BASE-NEXT: [[TMP1:%.*]] = extractelement <24 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0, i32 0, i32 0>, i32 [[SEL2]]
-; BASE-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4
+; BASE-NEXT: [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5)
+; BASE-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false)
+; BASE-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; BASE-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30
+; BASE-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; BASE-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; BASE-NEXT: [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; BASE-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; BASE-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4
; BASE-NEXT: ret void
;
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -87,40 +163,40 @@ define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 {
%c2 = icmp uge i32 %y, 3
%sel1 = select i1 %c1, i32 1, i32 2
%sel2 = select i1 %c2, i32 0, i32 %sel1
- %alloca = alloca [24 x i32], align 16, addrspace(5)
- call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 96, i1 false)
- %gep.0 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
- %gep.1 = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 20
+ %alloca = alloca [32 x i32], align 16, addrspace(5)
+ call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 128, i1 false)
+ %gep.0 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
+ %gep.1 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 30
store i32 42, ptr addrspace(5) %gep.0
store i32 43, ptr addrspace(5) %gep.1
- %gep = getelementptr inbounds [24 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
+ %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
%load = load i32, ptr addrspace(5) %gep
store i32 %load, ptr %out
ret void
}

-define amdgpu_kernel void @i32_32_elements(ptr %out) #0 {
-; DEFAULT-LABEL: define amdgpu_kernel void @i32_32_elements(
-; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] {
-; DEFAULT-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; DEFAULT-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
-; DEFAULT-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3
-; DEFAULT-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3
-; DEFAULT-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
-; DEFAULT-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
-; DEFAULT-NEXT: [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5)
-; DEFAULT-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false)
-; DEFAULT-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
-; DEFAULT-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30
-; DEFAULT-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; DEFAULT-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4
-; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; DEFAULT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
-; DEFAULT-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4
-; DEFAULT-NEXT: ret void
+define amdgpu_kernel void @i32_32_elements_attrib(ptr %out) #2 {
+; MAX16-LABEL: define amdgpu_kernel void @i32_32_elements_attrib(
+; MAX16-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
+; MAX16-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; MAX16-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; MAX16-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3
+; MAX16-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3
+; MAX16-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
+; MAX16-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
+; MAX16-NEXT: [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5)
+; MAX16-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false)
+; MAX16-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
+; MAX16-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30
+; MAX16-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4
+; MAX16-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; MAX16-NEXT: [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
+; MAX16-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; MAX16-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4
+; MAX16-NEXT: ret void
;
-; MAX24-LABEL: define amdgpu_kernel void @i32_32_elements(
-; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] {
+; MAX24-LABEL: define amdgpu_kernel void @i32_32_elements_attrib(
+; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
; MAX24-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; MAX24-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
; MAX24-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3
@@ -138,38 +214,6 @@ define amdgpu_kernel void @i32_32_elements(ptr %out) #0 {
; MAX24-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4
; MAX24-NEXT: ret void
;
-; MAX32-LABEL: define amdgpu_kernel void @i32_32_elements(
-; MAX32-SAME: ptr [[OUT:%.*]]) #[[ATTR0]] {
-; MAX32-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; MAX32-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
-; MAX32-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3
-; MAX32-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3
-; MAX32-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
-; MAX32-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
-; MAX32-NEXT: [[ALLOCA:%.*]] = freeze <32 x i32> poison
-; MAX32-NEXT: [[TMP1:%.*]] = extractelement <32 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0>, i32 [[SEL2]]
-; MAX32-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4
-; MAX32-NEXT: ret void
-;
- %x = tail call i32 @llvm.amdgcn.workitem.id.x()
- %y = tail call i32 @llvm.amdgcn.workitem.id.y()
- %c1 = icmp uge i32 %x, 3
- %c2 = icmp uge i32 %y, 3
- %sel1 = select i1 %c1, i32 1, i32 2
- %sel2 = select i1 %c2, i32 0, i32 %sel1
- %alloca = alloca [32 x i32], align 16, addrspace(5)
- call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 128, i1 false)
- %gep.0 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 0
- %gep.1 = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 30
- store i32 42, ptr addrspace(5) %gep.0
- store i32 43, ptr addrspace(5) %gep.1
- %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %sel2
- %load = load i32, ptr addrspace(5) %gep
- store i32 %load, ptr %out
- ret void
-}
-
-define amdgpu_kernel void @i32_32_elements_attrib(ptr %out) #2 {
; DEFAULT-LABEL: define amdgpu_kernel void @i32_32_elements_attrib(
; DEFAULT-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
; DEFAULT-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -183,38 +227,6 @@ define amdgpu_kernel void @i32_32_elements_attrib(ptr %out) #2 {
; DEFAULT-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4
; DEFAULT-NEXT: ret void
;
-; MAX24-LABEL: define amdgpu_kernel void @i32_32_elements_attrib(
-; MAX24-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
-; MAX24-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; MAX24-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
-; MAX24-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3
-; MAX24-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3
-; MAX24-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
-; MAX24-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
-; MAX24-NEXT: [[ALLOCA:%.*]] = alloca [32 x i32], align 16, addrspace(5)
-; MAX24-NEXT: call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 128, i1 false)
-; MAX24-NEXT: [[GEP_0:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
-; MAX24-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 30
-; MAX24-NEXT: store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; MAX24-NEXT: store i32 43, ptr addrspace(5) [[GEP_1]], align 4
-; MAX24-NEXT: [[GEP:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; MAX24-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
-; MAX24-NEXT: store i32 [[LOAD]], ptr [[OUT]], align 4
-; MAX24-NEXT: ret void
-;
-; MAX32-LABEL: define amdgpu_kernel void @i32_32_elements_attrib(
-; MAX32-SAME: ptr [[OUT:%.*]]) #[[ATTR2:[0-9]+]] {
-; MAX32-NEXT: [[X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; MAX32-NEXT: [[Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
-; MAX32-NEXT: [[C1:%.*]] = icmp uge i32 [[X]], 3
-; MAX32-NEXT: [[C2:%.*]] = icmp uge i32 [[Y]], 3
-; MAX32-NEXT: [[SEL1:%.*]] = select i1 [[C1]], i32 1, i32 2
-; MAX32-NEXT: [[SEL2:%.*]] = select i1 [[C2]], i32 0, i32 [[SEL1]]
-; MAX32-NEXT: [[ALLOCA:%.*]] = freeze <32 x i32> poison
-; MAX32-NEXT: [[TMP1:%.*]] = extractelement <32 x i32> <i32 42, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 43, i32 0>, i32 [[SEL2]]
-; MAX32-NEXT: store i32 [[TMP1]], ptr [[OUT]], align 4
-; MAX32-NEXT: ret void
-;
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
%y = tail call i32 @llvm.amdgcn.workitem.id.y()
%c1 = icmp uge i32 %x, 3
@@ -237,6 +249,6 @@ declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.workitem.id.y()
declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32, i1 immarg)

-attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" }
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-regs"="16" }
attributes #1 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-regs"="24" }
-attributes #2 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" "amdgpu-promote-alloca-to-vector-max-regs"="32" }
+attributes #2 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
index aabd5df..ec04c6a 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca < %s | FileCheck --enable-var-scope %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-max-regs=16 < %s | FileCheck --enable-var-scope %s

declare void @llvm.memcpy.p5.p1.i32(ptr addrspace(5) nocapture, ptr addrspace(1) nocapture, i32, i1) #0
declare void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) nocapture, ptr addrspace(5) nocapture, i32, i1) #0
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
index 13605a1..606cd65 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
@@ -204,11 +204,11 @@ attributes #7 = { "amdgpu-flat-work-group-size"="64,256" }
;.
; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="512,1024" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="512,1024" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR7]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
index b87d266..02c7647 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
@@ -279,7 +279,7 @@ define amdgpu_kernel void @kernel_3_6() #12 {
; 3,6 -> 6,9
define internal void @refine_upper_func_3_6() #13 {
; CHECK-LABEL: define internal void @refine_upper_func_3_6
-; CHECK-SAME: () #[[ATTR9]] {
+; CHECK-SAME: () #[[ATTR14:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
@@ -288,7 +288,7 @@ define internal void @refine_upper_func_3_6() #13 {
; 4,8 -> 6,8
define internal void @refine_lower_func_4_8() #14 {
; CHECK-LABEL: define internal void @refine_lower_func_4_8
-; CHECK-SAME: () #[[ATTR14:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR15:[0-9]+]] {
; CHECK-NEXT: call void @refine_upper_func_3_6()
; CHECK-NEXT: ret void
;
@@ -298,7 +298,7 @@ define internal void @refine_lower_func_4_8() #14 {
define amdgpu_kernel void @kernel_foo_6_8() #15 {
; CHECK-LABEL: define amdgpu_kernel void @kernel_foo_6_8
-; CHECK-SAME: () #[[ATTR15:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR16:[0-9]+]] {
; CHECK-NEXT: call void @refine_upper_func_3_6()
; CHECK-NEXT: call void @refine_lower_func_4_8()
; CHECK-NEXT: call void @func_9_10_a()
@@ -313,7 +313,7 @@ define amdgpu_kernel void @kernel_foo_6_8() #15 {
; 5,5 -> 5,5
define internal void @func_5_5() #16 {
; CHECK-LABEL: define internal void @func_5_5
-; CHECK-SAME: () #[[ATTR16:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR17:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
@@ -322,7 +322,7 @@ define internal void @func_5_5() #16 {
; 5,8 -> 8,8
define internal void @func_5_8() #17 {
; CHECK-LABEL: define internal void @func_5_8
-; CHECK-SAME: () #[[ATTR17:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR18:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
@@ -331,7 +331,7 @@ define internal void @func_5_8() #17 {
; 9,10 -> 9,10
define internal void @func_9_10_a() #18 {
; CHECK-LABEL: define internal void @func_9_10_a
-; CHECK-SAME: () #[[ATTR18:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR19:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
@@ -340,7 +340,7 @@ define internal void @func_9_10_a() #18 {
; 9,10 -> 9,9
define internal void @func_9_10_b() #18 {
; CHECK-LABEL: define internal void @func_9_10_b
-; CHECK-SAME: () #[[ATTR18]] {
+; CHECK-SAME: () #[[ATTR19]] {
; CHECK-NEXT: ret void
;
ret void
@@ -348,7 +348,7 @@ define internal void @func_9_10_b() #18 {
define amdgpu_kernel void @kernel_bar_8_9() #19 {
; CHECK-LABEL: define amdgpu_kernel void @kernel_bar_8_9
-; CHECK-SAME: () #[[ATTR19:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR20:[0-9]+]] {
; CHECK-NEXT: call void @refine_upper_func_3_6()
; CHECK-NEXT: call void @func_5_5()
; CHECK-NEXT: call void @func_9_10_b()
@@ -408,15 +408,16 @@ attributes #19 = { "amdgpu-waves-per-eu"="8,9" }
; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR7]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR8]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR9]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR9]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR10]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR11]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR12]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR12]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR13]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR14]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR15]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR16]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR17]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR18]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR19]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR14]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR15]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR16]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR17]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR18]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR19]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR20]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
index 013b68a..99e5d00 100644
--- a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
@@ -1,5 +1,7 @@
-;RUN: llc < %s -mtriple=amdgcn-pal -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
-;RUN: llc < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
+;RUN: llc -global-isel=1 < %s -mtriple=amdgcn-pal -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
+;RUN: llc -global-isel=1 < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
+;RUN: llc -global-isel=0 < %s -mtriple=amdgcn-pal -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
+;RUN: llc -global-isel=0 < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
; ;CHECK-LABEL: {{^}}_amdgpu_ps_1_arg:
; ;CHECK: NumVgprs: 4
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
index 68ef30a9..4db232c 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -103,17 +103,16 @@ define void @baseptr_null(i64 %offset, i8 %v) {
define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
; GFX942-LABEL: llvm_amdgcn_queue_ptr:
; GFX942: ; %bb.0:
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
-; GFX942-NEXT: global_load_ubyte v0, v2, s[2:3] sc0 sc1
-; GFX942-NEXT: global_load_ubyte v0, v2, s[4:5] offset:8 sc0 sc1
-; GFX942-NEXT: global_load_ubyte v0, v2, s[0:1] sc0 sc1
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: global_load_ubyte v1, v0, s[2:3] sc0 sc1
+; GFX942-NEXT: global_load_ubyte v1, v0, s[4:5] offset:8 sc0 sc1
+; GFX942-NEXT: global_load_ubyte v1, v0, s[0:1] sc0 sc1
; GFX942-NEXT: ; kill: killed $sgpr0_sgpr1
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-NEXT: ; kill: killed $sgpr2_sgpr3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_endpgm
%queue.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
diff --git a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
index 131c5f3..f67cbe3 100644
--- a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
+++ b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
@@ -10,6 +10,8 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GETREG,GETREG-GISEL -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
declare i64 @llvm.readcyclecounter() #0
@@ -21,6 +23,7 @@ declare i64 @llvm.readcyclecounter() #0
; GFX12: s_getreg_b32 [[HI2:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_HI)
; GFX12: s_cmp_eq_u32 [[HI1]], [[HI2]]
; GFX12: s_cselect_b32 {{s[0-9]+}}, [[LO1]], 0
+; GFX1250: s_get_shader_cycles_u64 s{{\[[0-9]+:[0-9]+\]}}
; GCN-DAG: kmcnt
; MEMTIME: store_dwordx2
; SIVI-NOT: kmcnt
@@ -53,6 +56,7 @@ define amdgpu_kernel void @test_readcyclecounter(ptr addrspace(1) %out) #0 {
; GFX12: s_getreg_b32 [[HI1:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_HI)
; GFX12: s_getreg_b32 [[LO1:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_LO)
; GFX12: s_getreg_b32 [[HI2:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES_HI)
+; GFX1250: s_get_shader_cycles_u64 s{{\[[0-9]+:[0-9]+\]}}
; GCN-DAG: s_load_{{dword|b32|b64}}
; GETREG-DAG: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_SHADER_CYCLES, 0, 20)
; GFX12: s_cmp_eq_u32 [[HI1]], [[HI2]]
diff --git a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
index 6e9e4e4..2b07fc7 100644
--- a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
@@ -19,5 +19,5 @@ define void @hoge() {
ret void
}
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir
index fef7332..31bd50a 100644
--- a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir
+++ b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir
@@ -45,13 +45,13 @@ body: |
INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def $agpr0
%14:vgpr_32 = COPY killed $agpr0
- INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 27262986 /* regdef:VReg_512 */, def %7, 13565962 /* regdef:VReg_256 */, def %8, 6094858 /* regdef:VReg_128 */, def %9, 4784138 /* regdef:VReg_96 */, def %10, 4784138 /* regdef:VReg_96 */, def %11
+ INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 27394058 /* regdef:VReg_512 */, def %7, 13697034 /* regdef:VReg_256 */, def %8, 6225930 /* regdef:VReg_128 */, def %9, 4915210 /* regdef:VReg_96 */, def %10, 4915210 /* regdef:VReg_96 */, def %11
INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 12 /* clobber */, implicit-def dead early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 27262985 /* reguse:VReg_512 */, %7
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 13565961 /* reguse:VReg_256 */, %8
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6094857 /* reguse:VReg_128 */, %9
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:VReg_96 */, %10
- INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:VReg_96 */, %11
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 27394057 /* reguse:VReg_512 */, %7
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 13697033 /* reguse:VReg_256 */, %8
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:VReg_128 */, %9
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4915209 /* reguse:VReg_96 */, %10
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4915209 /* reguse:VReg_96 */, %11
$agpr1 = COPY %14
INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9 /* reguse */, killed $agpr1
SI_RETURN
diff --git a/llvm/test/CodeGen/AMDGPU/regcoalesce-64-bit-only-regs.mir b/llvm/test/CodeGen/AMDGPU/regcoalesce-64-bit-only-regs.mir
new file mode 100644
index 0000000..038e195
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/regcoalesce-64-bit-only-regs.mir
@@ -0,0 +1,57 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -start-before=register-coalescer -show-mc-encoding -o - %s | FileCheck %s
+
+# FIXME: These SRC_*_HI registers do not exist, although they are defined in the
+# register file and happily used by the coalescer. The resulting encoding in fact
+# belongs to the 64-bit register and its corresponding *_LO 32-bit part.
+
+# CHECK-LABEL: src_private_base:
+# CHECK: s_subb_u32 s0, SRC_PRIVATE_BASE_HI, s1 ; encoding: [0xed,0x01,0x80,0x82]
+---
+name: src_private_base
+tracksRegLiveness: true
+body: |
+ bb.0:
+ %0:sreg_64 = COPY $src_private_base
+ %1:sreg_64 = IMPLICIT_DEF
+ $scc = IMPLICIT_DEF
+ %2:sreg_32 = S_SUBB_U32 killed %0.sub1:sreg_64, %1.sub1:sreg_64, implicit-def dead $scc, implicit killed $scc
+...
+
+# CHECK-LABEL: src_private_limit:
+# CHECK: s_subb_u32 s0, SRC_PRIVATE_LIMIT_HI, s1 ; encoding: [0xee,0x01,0x80,0x82]
+---
+name: src_private_limit
+tracksRegLiveness: true
+body: |
+ bb.0:
+ %0:sreg_64 = COPY $src_private_limit
+ %1:sreg_64 = IMPLICIT_DEF
+ $scc = IMPLICIT_DEF
+ %2:sreg_32 = S_SUBB_U32 killed %0.sub1:sreg_64, %1.sub1:sreg_64, implicit-def dead $scc, implicit killed $scc
+...
+
+# CHECK-LABEL: src_shared_base:
+# CHECK: s_subb_u32 s0, SRC_SHARED_BASE_HI, s1 ; encoding: [0xeb,0x01,0x80,0x82]
+---
+name: src_shared_base
+tracksRegLiveness: true
+body: |
+ bb.0:
+ %0:sreg_64 = COPY $src_shared_base
+ %1:sreg_64 = IMPLICIT_DEF
+ $scc = IMPLICIT_DEF
+ %2:sreg_32 = S_SUBB_U32 killed %0.sub1:sreg_64, %1.sub1:sreg_64, implicit-def dead $scc, implicit killed $scc
+...
+
+# CHECK-LABEL: src_shared_limit:
+# CHECK: s_subb_u32 s0, SRC_SHARED_LIMIT_HI, s1 ; encoding: [0xec,0x01,0x80,0x82]
+---
+name: src_shared_limit
+tracksRegLiveness: true
+body: |
+ bb.0:
+ %0:sreg_64 = COPY $src_shared_limit
+ %1:sreg_64 = IMPLICIT_DEF
+ $scc = IMPLICIT_DEF
+ %2:sreg_32 = S_SUBB_U32 killed %0.sub1:sreg_64, %1.sub1:sreg_64, implicit-def dead $scc, implicit killed $scc
+...
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index f0c8fed..bfc310a 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -248,27 +248,15 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v3
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: s_mov_b32 s4, 63
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v11
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v12
@@ -298,8 +286,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v0, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11
; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v1, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
@@ -308,20 +294,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[4:5]
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v16, v3
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5]
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
@@ -335,8 +315,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v7, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v12, s11
; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v8, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14
@@ -345,20 +323,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[4:5]
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[4:5]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v9
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v12
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v11
; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[4:5]
; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec
; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[4:5]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5
@@ -428,7 +400,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8
; GFX9-O0-NEXT: v_min_u32_e64 v7, v7, v8
; GFX9-O0-NEXT: s_mov_b32 s12, 0
-; GFX9-O0-NEXT: ; implicit-def: $sgpr14
; GFX9-O0-NEXT: v_mov_b32_e32 v10, s12
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10
@@ -437,7 +408,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s13
; GFX9-O0-NEXT: v_ffbh_u32_e64 v9, v9
; GFX9-O0-NEXT: v_min_u32_e64 v12, v6, v9
-; GFX9-O0-NEXT: ; implicit-def: $sgpr14
; GFX9-O0-NEXT: v_mov_b32_e32 v6, s12
; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6
@@ -456,8 +426,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9
; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v7, v8, s[8:9]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6
; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
@@ -466,7 +434,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13
; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1
; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr16
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
@@ -475,7 +442,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13
; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3
; GFX9-O0-NEXT: v_min_u32_e64 v11, v4, v10
-; GFX9-O0-NEXT: ; implicit-def: $sgpr13
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12
; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4
@@ -493,8 +459,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
@@ -509,15 +473,11 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11
; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11
; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -558,8 +518,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9]
@@ -567,8 +525,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13]
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4
; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7]
@@ -803,8 +759,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7
; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12]
@@ -837,12 +791,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v18, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8
@@ -860,12 +810,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_addc_co_u32_e32 v16, vcc, v10, v11, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4
; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v20, v9
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16
@@ -988,8 +934,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v18
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[14:15]
@@ -1002,8 +946,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_mov_b32 s8, s4
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[6:7]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12
@@ -1021,12 +963,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_addc_co_u32_e32 v18, vcc, v14, v15, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v14, s6
; GFX9-O0-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v14, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v19, v13
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12
; GFX9-O0-NEXT: v_mov_b32_e32 v13, s5
@@ -1100,12 +1038,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9
; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
@@ -1148,8 +1082,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5]
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr10
-; GFX9-O0-NEXT: ; implicit-def: $sgpr10
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3
; GFX9-O0-NEXT: v_lshlrev_b64 v[6:7], v2, v[6:7]
@@ -1159,8 +1091,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
@@ -1239,7 +1169,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_add3_u32 v8, v0, v5, v8
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s5
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v0
@@ -1247,7 +1176,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec
; GFX9-O0-NEXT: s_mov_b32 s5, 0
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s5
; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v0
@@ -1271,14 +1199,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_add3_u32 v8, v8, v9, v14
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr7
; GFX9-O0-NEXT: v_mov_b32_e32 v14, s6
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14
; GFX9-O0-NEXT: v_lshlrev_b64 v[8:9], s4, v[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v9
; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 killed $vgpr18_vgpr19 killed $exec
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: v_mov_b32_e32 v14, s5
; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v19, v14
@@ -1299,7 +1225,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8
; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v5, v1, 0
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v14
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: v_mov_b32_e32 v8, s5
; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v19, v8
@@ -1307,7 +1232,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr7
; GFX9-O0-NEXT: v_mov_b32_e32 v9, s6
; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v9
@@ -1321,7 +1245,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v21, v8
; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v5, v2, 0
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5
@@ -1329,7 +1252,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr7
; GFX9-O0-NEXT: v_mov_b32_e32 v18, s6
; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v18
@@ -1343,7 +1265,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v23, v5
; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v0, v2, 0
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v15
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v19, v2
@@ -1366,7 +1287,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v19, v2
; GFX9-O0-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v0, v1, 0
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v22
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s5
; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
@@ -1374,7 +1294,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v22, v23
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr7
; GFX9-O0-NEXT: v_mov_b32_e32 v5, s6
; GFX9-O0-NEXT: ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v23, v5
@@ -1423,7 +1342,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1]
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2
@@ -1446,12 +1364,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v8, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v5, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v2, vcc, v1, v2, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
@@ -1478,12 +1392,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5
@@ -1714,26 +1624,14 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v3
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
@@ -1804,7 +1702,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6
; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6
; GFX9-O0-NEXT: s_mov_b32 s8, 0
-; GFX9-O0-NEXT: ; implicit-def: $sgpr10
; GFX9-O0-NEXT: v_mov_b32_e32 v8, s8
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
@@ -1813,7 +1710,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9
; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7
; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr10
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4
@@ -1834,15 +1730,12 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v5, v6, s[12:13]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0
; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9
; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1
; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
@@ -1851,7 +1744,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9
; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3
; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v10
-; GFX9-O0-NEXT: ; implicit-def: $sgpr9
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4
@@ -1871,8 +1763,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
@@ -1889,15 +1779,11 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11
; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11
; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -1935,16 +1821,12 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
-; GFX9-O0-NEXT: ; implicit-def: $sgpr12
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9]
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
-; GFX9-O0-NEXT: ; implicit-def: $sgpr8
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4
; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7]
@@ -2179,8 +2061,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v4, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v7
; GFX9-O0-NEXT: v_ashrrev_i64 v[13:14], s4, v[11:12]
@@ -2213,12 +2093,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v18, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v11, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v7, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8
@@ -2236,12 +2112,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_addc_co_u32_e32 v16, vcc, v10, v11, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v10, s4
; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v10, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v20, v9
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v16
@@ -2364,8 +2236,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v18
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5
; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], v4, v[14:15]
@@ -2378,8 +2248,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_mov_b32 s8, s4
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[6:7]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v12
@@ -2397,12 +2265,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_addc_co_u32_e32 v18, vcc, v14, v15, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v14, s6
; GFX9-O0-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v14, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v19, v13
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v12
; GFX9-O0-NEXT: v_mov_b32_e32 v13, s5
@@ -2476,12 +2340,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9
; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
@@ -2524,8 +2384,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5]
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr10
-; GFX9-O0-NEXT: ; implicit-def: $sgpr10
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3
; GFX9-O0-NEXT: v_lshlrev_b64 v[6:7], v2, v[6:7]
@@ -2535,8 +2393,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8
; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v3, v6, s[4:5]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
@@ -2611,7 +2467,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v4
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4
@@ -2619,7 +2474,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4
; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 killed $vgpr12_vgpr13 killed $exec
; GFX9-O0-NEXT: s_mov_b32 s5, 0
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2
@@ -2643,14 +2497,12 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_add3_u32 v2, v2, v3, v10
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr7
; GFX9-O0-NEXT: v_mov_b32_e32 v10, s6
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10
; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[2:3]
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v3
; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: v_mov_b32_e32 v10, s5
; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v10
@@ -2671,7 +2523,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v13, v2
; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v7, v5, 0
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: v_mov_b32_e32 v10, s5
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10
@@ -2679,7 +2530,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr7
; GFX9-O0-NEXT: v_mov_b32_e32 v11, s6
; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v11
@@ -2693,7 +2543,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v10
; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v7, v6, 0
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: v_mov_b32_e32 v7, s5
; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7
@@ -2701,7 +2550,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr7
; GFX9-O0-NEXT: v_mov_b32_e32 v16, s6
; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v16
@@ -2715,7 +2563,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v19, v7
; GFX9-O0-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v4, v6, 0
; GFX9-O0-NEXT: v_mov_b32_e32 v16, v11
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: v_mov_b32_e32 v6, s5
; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6
@@ -2738,7 +2585,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v17, v14
; GFX9-O0-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v4, v5, 0
; GFX9-O0-NEXT: v_mov_b32_e32 v18, v14
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5
; GFX9-O0-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v19, v4
@@ -2746,7 +2592,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v14, v15
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: ; implicit-def: $sgpr7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr7
; GFX9-O0-NEXT: v_mov_b32_e32 v5, s6
; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v15, v5
@@ -2795,7 +2640,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_lshlrev_b64 v[5:6], s4, v[4:5]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6
; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: v_mov_b32_e32 v4, s5
; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4
@@ -2818,12 +2662,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5
@@ -2864,16 +2704,10 @@ define i128 @v_srem_i128_v_pow2k(i128 %lhs) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
@@ -2893,8 +2727,6 @@ define i128 @v_srem_i128_v_pow2k(i128 %lhs) {
; GFX9-O0-NEXT: v_addc_co_u32_e32 v8, vcc, v3, v2, vcc
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-O0-NEXT: v_addc_co_u32_e32 v2, vcc, v1, v2, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7
@@ -2911,8 +2743,6 @@ define i128 @v_srem_i128_v_pow2k(i128 %lhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8
@@ -2921,12 +2751,8 @@ define i128 @v_srem_i128_v_pow2k(i128 %lhs) {
; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v6, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
; GFX9-O0-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5
@@ -2957,17 +2783,11 @@ define i128 @v_urem_i128_v_pow2k(i128 %lhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr1 killed $exec
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: s_mov_b32 s6, 1
; GFX9-O0-NEXT: s_mov_b32 s4, -1
; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
diff --git a/llvm/test/CodeGen/AMDGPU/remat-vop.mir b/llvm/test/CodeGen/AMDGPU/remat-vop.mir
index 4f6ea44..23cf6f0 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-vop.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat-vop.mir
@@ -278,16 +278,16 @@ machineFunctionInfo:
body: |
bb.0:
; GCN-LABEL: name: test_remat_v_cvt_i32_f64_e64_undef
- ; GCN: [[V_CVT_I32_F64_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, undef %1:vreg_64, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: [[V_CVT_I32_F64_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, undef %1:vreg_64, 0, 0, implicit $exec, implicit $mode
- ; GCN-NEXT: [[V_CVT_I32_F64_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, undef %1:vreg_64, 0, 0, implicit $exec, implicit $mode
+ ; GCN: [[V_CVT_I32_F64_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode
+ ; GCN-NEXT: [[V_CVT_I32_F64_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode
+ ; GCN-NEXT: [[V_CVT_I32_F64_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, undef %1:vreg_64_align2, 0, 0, implicit $exec, implicit $mode
; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e64_]]
; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e64_1]]
; GCN-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e64_2]]
; GCN-NEXT: S_ENDPGM 0
- %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, undef %0:vreg_64, 0, 0, implicit $exec, implicit $mode
- %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, undef %0:vreg_64, 0, 0, implicit $exec, implicit $mode
- %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, undef %0:vreg_64, 0, 0, implicit $exec, implicit $mode
+ %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, undef %0:vreg_64_align2, 0, 0, implicit $exec, implicit $mode
+ %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, undef %0:vreg_64_align2, 0, 0, implicit $exec, implicit $mode
+ %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e64 0, undef %0:vreg_64_align2, 0, 0, implicit $exec, implicit $mode
S_NOP 0, implicit %1
S_NOP 0, implicit %2
S_NOP 0, implicit %3
diff --git a/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave64-feature.ll b/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave64-feature.ll
new file mode 100644
index 0000000..03dbfdc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remove-incompatible-wave64-feature.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32 -stop-after=amdgpu-remove-incompatible-functions\
+; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX1250 %s
+; RUN: FileCheck --check-prefix=WARN-GFX1250 %s < %t
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32 < %s
+
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\
+; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX1200 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 < %s
+
+; WARN-GFX1250: removing function 'needs_wavefrontsize64': +wavefrontsize64 is not supported on the current target
+; WARN-GFX1250-NOT: not supported
+
+define void @needs_wavefrontsize64(ptr %out) #0 {
+; GFX1250-NOT: @needs_wavefrontsize64
+; GFX1200: define void @needs_wavefrontsize64(
+ %1 = tail call i64 @llvm.read_register.i64(metadata !0)
+ %2 = tail call i64 @llvm.ctpop.i64(i64 %1)
+ store i64 %2, ptr %out, align 4
+ ret void
+}
+
+define void @caller(ptr %out) {
+ ; GFX1250: call void null(
+ ; GFX1200: call void @needs_wavefrontsize64(
+ call void @needs_wavefrontsize64(ptr %out)
+ ret void
+}
+
+declare i64 @llvm.read_register.i64(metadata)
+declare i64 @llvm.ctpop.i64(i64)
+
+!0 = !{!"exec"}
+
+attributes #0 = { "target-features"="+wavefrontsize64" }
diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
index ed4e691..fe643ff 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
@@ -191,12 +191,12 @@ define amdgpu_kernel void @kernel_lds_recursion() {
!1 = !{i32 1, !"amdhsa_code_object_version", i32 400}
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="4" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" "amdgpu-lds-size"="4" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir
new file mode 100644
index 0000000..1456c2c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir
@@ -0,0 +1,261 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=greedy,amdgpu-rewrite-agpr-copy-mfma -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: test_rewrite_mfma_copy_from_agpr_physreg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $agpr0_agpr1
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_copy_from_agpr_physreg
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $agpr0_agpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY $agpr0_agpr1
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]]:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[COPY3]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:vreg_64_align2 = COPY $agpr0_agpr1
+ %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
+ GLOBAL_STORE_DWORDX2 %0, %4, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ SI_RETURN
+...
+
+---
+name: test_rewrite_mfma_copy_from_agpr_unrewritable_use
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_copy_from_agpr_unrewritable_use
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]]:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:VReg_64_Align2 */, [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:areg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ %4:vreg_128_align2 = COPY %3
+ %5:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:VReg_64_Align2 */, %5
+ SI_RETURN
+...
+
+---
+name: test_rewrite_mfma_copy_from_agpr_src2_subreg_use
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_copy_from_agpr_src2_subreg_use
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]]:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:areg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ %4:vreg_128_align2 = COPY %3
+ %5:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ GLOBAL_STORE_DWORDX2 %0, %5, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ SI_RETURN
+...
+
+---
+name: test_rewrite_mfma_copy_from_agpr_vdst_subreg_use
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_copy_from_agpr_vdst_subreg_use
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:areg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ %4:vreg_128_align2 = COPY %3
+ %4.sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4.sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %4, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ SI_RETURN
+...
+
+# A-to-V copy is performed one subregister at a time instead.
+---
+name: test_rewrite_mfma_copy_from_agpr_split_copy
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_copy_from_agpr_split_copy
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:areg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub0
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX2_]].sub1
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]]:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[COPY3]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:areg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ undef %4.sub0:vreg_64_align2 = COPY %3.sub0
+ %4.sub1:vreg_64_align2 = COPY %3.sub1
+ %5:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4, 0, 0, 0, implicit $mode, implicit $exec
+ GLOBAL_STORE_DWORDX2 %0, %5, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ SI_RETURN
+...
+
+---
+name: test_rewrite_mfma_copy_from_agpr_copyback
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_copy_from_agpr_copyback
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:areg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[COPY3]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:areg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY4]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:areg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %4:vreg_64_align2 = COPY %3
+ %5:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4, 0, 0, 0, implicit $mode, implicit $exec
+ %6:areg_64_align2 = COPY %5
+ GLOBAL_STORE_DWORDX2 %0, %6, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ SI_RETURN
+...
+
+# There is a read of the copy from AGPR to VGPR in the dst operand of the MFMA.
+---
+name: test_rewrite_mfma_copy_from_agpr_vdst_subreg_use_imm_src2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_copy_from_agpr_vdst_subreg_use_imm_src2
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:areg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ %4:vreg_128_align2 = COPY %3
+ %4.sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, 0, 0, 0, 0, implicit $mode, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %4, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ SI_RETURN
+...
+
+# Degenerate case. The copy from AGPR to VGPR is a dead undef subreg def.
+---
+name: test_rewrite_mfma_copy_from_agpr_undef_vdst_subreg_use_imm_src2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_copy_from_agpr_undef_vdst_subreg_use_imm_src2
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: dead [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]]
+ ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:areg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ %4:vreg_128_align2 = COPY %3
+ undef %4.sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, 0, 0, 0, 0, implicit $mode, implicit $exec
+ GLOBAL_STORE_DWORDX4 %0, %4, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ SI_RETURN
+...
+
+# Degenerate case. The copy from AGPR to VGPR is dead, but the same register
+# is redefined as a whole register.
+---
+name: test_rewrite_mfma_copy_from_agpr_to_vdst_def_imm_src2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_copy_from_agpr_to_vdst_def_imm_src2
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:areg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: dead [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]]:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:areg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %4:vreg_64_align2 = COPY %3
+ %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, 0, 0, 0, 0, implicit $mode, implicit $exec
+ GLOBAL_STORE_DWORDX2 %0, %4, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ SI_RETURN
+...
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll
new file mode 100644
index 0000000..cefcd7e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll
@@ -0,0 +1,181 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck %s
+
+target triple = "amdgcn-amd-amdhsa"
+
+define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, i1 %cond) #0 {
+; CHECK-LABEL: test_rewrite_mfma_copy_to_agpr_phi:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_load_dword s6, s[4:5], 0x10
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_bitcmp0_b32 s6, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB0_2
+; CHECK-NEXT: ; %bb.1: ; %else
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v32, s[2:3] offset:112
+; CHECK-NEXT: global_load_dwordx4 a[24:27], v32, s[2:3] offset:96
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v32, s[2:3] offset:80
+; CHECK-NEXT: global_load_dwordx4 a[16:19], v32, s[2:3] offset:64
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v32, s[2:3] offset:48
+; CHECK-NEXT: global_load_dwordx4 a[8:11], v32, s[2:3] offset:32
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v32, s[2:3] offset:16
+; CHECK-NEXT: global_load_dwordx4 a[0:3], v32, s[2:3]
+; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
+; CHECK-NEXT: v_mov_b32_e32 v34, 4.0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v33, v34, a[0:31]
+; CHECK-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec
+; CHECK-NEXT: s_cbranch_execz .LBB0_3
+; CHECK-NEXT: s_branch .LBB0_4
+; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: ; implicit-def: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31
+; CHECK-NEXT: .LBB0_3: ; %if
+; CHECK-NEXT: s_nop 7
+; CHECK-NEXT: s_nop 7
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v32, s[0:1] offset:112
+; CHECK-NEXT: global_load_dwordx4 a[24:27], v32, s[0:1] offset:96
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v32, s[0:1] offset:80
+; CHECK-NEXT: global_load_dwordx4 a[16:19], v32, s[0:1] offset:64
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v32, s[0:1] offset:48
+; CHECK-NEXT: global_load_dwordx4 a[8:11], v32, s[0:1] offset:32
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v32, s[0:1] offset:16
+; CHECK-NEXT: global_load_dwordx4 a[0:3], v32, s[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v32, 2.0
+; CHECK-NEXT: v_mov_b32_e32 v33, 4.0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v33, a[0:31]
+; CHECK-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec
+; CHECK-NEXT: .LBB0_4: ; %endif
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_endpgm
+bb:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ br i1 %cond, label %if, label %else
+
+if:
+ %gep.0 = getelementptr <32 x float>, ptr addrspace(1) %arg0, i32 %id
+ %in.0 = load <32 x float>, ptr addrspace(1) %gep.0, align 128
+ %mai.0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %in.0, i32 0, i32 0, i32 0)
+ br label %endif
+
+else:
+ %gep.1 = getelementptr <32 x float>, ptr addrspace(1) %arg1, i32 %id
+ %in.1 = load <32 x float>, ptr addrspace(1) %gep.1, align 128
+ %mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
+ br label %endif
+
+endif:
+ %phi = phi <32 x float> [ %mai.0, %if ], [ %mai.1, %else ]
+ call void asm sideeffect "; use $0", "a"(<32 x float> %phi)
+ ret void
+}
+
+define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi_loop(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, i32 %n) #0 {
+; CHECK-LABEL: test_rewrite_mfma_copy_to_agpr_phi_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; CHECK-NEXT: s_load_dword s0, s[4:5], 0x10
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[28:31], v32, s[2:3] offset:112
+; CHECK-NEXT: global_load_dwordx4 v[24:27], v32, s[2:3] offset:96
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v32, s[2:3] offset:80
+; CHECK-NEXT: global_load_dwordx4 v[16:19], v32, s[2:3] offset:64
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v32, s[2:3] offset:48
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v32, s[2:3] offset:32
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v32, s[2:3] offset:16
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3]
+; CHECK-NEXT: v_mov_b32_e32 v64, 4.0
+; CHECK-NEXT: v_mov_b32_e32 v65, 2.0
+; CHECK-NEXT: .LBB1_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_nop 7
+; CHECK-NEXT: s_nop 7
+; CHECK-NEXT: v_mov_b64_e32 v[62:63], v[30:31]
+; CHECK-NEXT: v_mov_b64_e32 v[60:61], v[28:29]
+; CHECK-NEXT: v_mov_b64_e32 v[58:59], v[26:27]
+; CHECK-NEXT: v_mov_b64_e32 v[56:57], v[24:25]
+; CHECK-NEXT: v_mov_b64_e32 v[54:55], v[22:23]
+; CHECK-NEXT: v_mov_b64_e32 v[52:53], v[20:21]
+; CHECK-NEXT: v_mov_b64_e32 v[50:51], v[18:19]
+; CHECK-NEXT: v_mov_b64_e32 v[48:49], v[16:17]
+; CHECK-NEXT: v_mov_b64_e32 v[46:47], v[14:15]
+; CHECK-NEXT: v_mov_b64_e32 v[44:45], v[12:13]
+; CHECK-NEXT: v_mov_b64_e32 v[42:43], v[10:11]
+; CHECK-NEXT: v_mov_b64_e32 v[40:41], v[8:9]
+; CHECK-NEXT: v_mov_b64_e32 v[38:39], v[6:7]
+; CHECK-NEXT: v_mov_b64_e32 v[36:37], v[4:5]
+; CHECK-NEXT: v_mov_b64_e32 v[34:35], v[2:3]
+; CHECK-NEXT: v_mov_b64_e32 v[32:33], v[0:1]
+; CHECK-NEXT: s_add_i32 s1, s1, 1
+; CHECK-NEXT: s_cmp_lt_u32 s1, s0
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v65, v64, v[32:63]
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v65, v64, v[0:31]
+; CHECK-NEXT: s_cbranch_scc1 .LBB1_1
+; CHECK-NEXT: ; %bb.2: ; %endif
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v32
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v33
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v34
+; CHECK-NEXT: v_accvgpr_write_b32 a3, v35
+; CHECK-NEXT: v_accvgpr_write_b32 a4, v36
+; CHECK-NEXT: v_accvgpr_write_b32 a5, v37
+; CHECK-NEXT: v_accvgpr_write_b32 a6, v38
+; CHECK-NEXT: v_accvgpr_write_b32 a7, v39
+; CHECK-NEXT: v_accvgpr_write_b32 a8, v40
+; CHECK-NEXT: v_accvgpr_write_b32 a9, v41
+; CHECK-NEXT: v_accvgpr_write_b32 a10, v42
+; CHECK-NEXT: v_accvgpr_write_b32 a11, v43
+; CHECK-NEXT: v_accvgpr_write_b32 a12, v44
+; CHECK-NEXT: v_accvgpr_write_b32 a13, v45
+; CHECK-NEXT: v_accvgpr_write_b32 a14, v46
+; CHECK-NEXT: v_accvgpr_write_b32 a15, v47
+; CHECK-NEXT: v_accvgpr_write_b32 a16, v48
+; CHECK-NEXT: v_accvgpr_write_b32 a17, v49
+; CHECK-NEXT: v_accvgpr_write_b32 a18, v50
+; CHECK-NEXT: v_accvgpr_write_b32 a19, v51
+; CHECK-NEXT: v_accvgpr_write_b32 a20, v52
+; CHECK-NEXT: v_accvgpr_write_b32 a21, v53
+; CHECK-NEXT: v_accvgpr_write_b32 a22, v54
+; CHECK-NEXT: v_accvgpr_write_b32 a23, v55
+; CHECK-NEXT: v_accvgpr_write_b32 a24, v56
+; CHECK-NEXT: v_accvgpr_write_b32 a25, v57
+; CHECK-NEXT: v_accvgpr_write_b32 a26, v58
+; CHECK-NEXT: v_accvgpr_write_b32 a27, v59
+; CHECK-NEXT: v_accvgpr_write_b32 a28, v60
+; CHECK-NEXT: v_accvgpr_write_b32 a29, v61
+; CHECK-NEXT: v_accvgpr_write_b32 a30, v62
+; CHECK-NEXT: v_accvgpr_write_b32 a31, v63
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_endpgm
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.0 = getelementptr <32 x float>, ptr addrspace(1) %arg0, i32 %id
+ %in.0 = load <32 x float>, ptr addrspace(1) %gep.0, align 128
+ br label %loop
+
+loop:
+ %i.phi = phi i32 [ 0, %entry ], [ %i.inc, %loop ]
+ %phi = phi <32 x float> [ %in.0, %entry ], [ %mai.1, %loop ]
+ %mai.0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %phi, i32 0, i32 0, i32 0)
+ %mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %mai.0, i32 0, i32 0, i32 0)
+ %i.inc = add i32 %i.phi, 1
+ %loop.cond = icmp ult i32 %i.inc, %n
+ br i1 %loop.cond, label %loop, label %endif
+
+endif:
+ call void asm sideeffect "; use $0", "a"(<32 x float> %phi)
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="1,1" }
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir
new file mode 100644
index 0000000..61a91b8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir
@@ -0,0 +1,226 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=greedy,amdgpu-rewrite-agpr-copy-mfma -o - %s | FileCheck %s
+
+# V-to-A copy is a subregister insert
+---
+name: test_rewrite_mfma_copy_subreg_insert
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_copy_subreg_insert
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:areg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
+ undef %5.sub0_sub1:areg_128_align2 = COPY %4
+ %5.sub2_sub3 = IMPLICIT_DEF
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, %5
+ GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ SI_RETURN
+...
+
+# V-to-A copy is a subregister extract
+---
+name: test_rewrite_mfma_copy_subreg_extract
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_copy_subreg_extract
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:areg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:agpr_32 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub0
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
+ %5:agpr_32 = COPY %4.sub0
+ GLOBAL_STORE_DWORD %0, %5, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+ SI_RETURN
+...
+
+# V-to-A copy is a subregister-to-subregister copy using the same subregister index
+---
+name: test_rewrite_mfma_copy_subreg_insert_extract_same_subreg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_copy_subreg_insert_extract_same_subreg
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:areg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub0
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
+ undef %5.sub0:areg_64_align2 = COPY %4.sub0
+ GLOBAL_STORE_DWORDX2 %0, %5, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ SI_RETURN
+...
+
+# V-to-A copy is a subregister-to-subregister copy between different subregister indices
+---
+name: test_rewrite_mfma_copy_subreg_insert_extract_different_subreg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_copy_subreg_insert_extract_different_subreg
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:areg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
+ undef %5.sub0:areg_64_align2 = COPY %4.sub1
+ GLOBAL_STORE_DWORDX2 %0, %5, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ SI_RETURN
+...
+
+# V-to-A copy is a subregister extract from a subregister def
+---
+name: test_rewrite_mfma_copy_subreg_extract_from_subreg_def
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_copy_subreg_extract_from_subreg_def
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:areg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: undef [[DEF:%[0-9]+]].sub0:areg_128_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF:%[0-9]+]].sub2_sub3:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:agpr_32 = COPY [[DEF]].sub0
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ undef %4.sub0 = IMPLICIT_DEF
+ %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
+ %5:agpr_32 = COPY %4.sub0
+ GLOBAL_STORE_DWORD %0, %5, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+ SI_RETURN
+...
+
+# V-to-A copy is a subregister insert from a subregister def
+---
+name: test_rewrite_mfma_copy_subreg_insert_from_subreg_def_tuple
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_copy_subreg_insert_from_subreg_def_tuple
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:areg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub2_sub3:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
+ undef %5.sub0_sub1:areg_128_align2 = COPY %4.sub2_sub3
+ %5.sub2_sub3 = IMPLICIT_DEF
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, %5
+ GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ SI_RETURN
+...
+
+# V-to-A copy is a subregister insert of a subregister from a
+# subregister def
+---
+name: test_rewrite_mfma_copy_subreg_insert_from_subreg_def_subreg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_copy_subreg_insert_from_subreg_def_subreg
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
+ undef %5.sub1:areg_128_align2 = COPY %4.sub2
+ %5.sub2_sub3 = IMPLICIT_DEF
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, %5
+ GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ SI_RETURN
+...
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir
new file mode 100644
index 0000000..a472097
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir
@@ -0,0 +1,333 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=greedy,amdgpu-rewrite-agpr-copy-mfma -o - %s | FileCheck %s
+
+---
+name: test_rewrite_mfma_src2_is_subreg_0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_src2_is_subreg_0
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ undef %5.sub0_sub1:areg_128_align2 = COPY %4
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, %5
+ GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ SI_RETURN
+...
+
+---
+name: test_rewrite_mfma_src2_is_subreg_1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_src2_is_subreg_1
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec
+ undef %5.sub0_sub1:areg_128_align2 = COPY %4
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, %5
+ GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ SI_RETURN
+...
+
+---
+name: test_rewrite_mfma_src2_is_subreg_chain_mfma_full_def
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_src2_is_subreg_chain_mfma_full_def
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:areg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub0_sub1:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub0_sub1
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_1:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3866633 /* reguse:AReg_64_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ undef %4.sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
+ %other_use:vreg_64_align2 = COPY %4.sub0_sub1
+ %5:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ %6:areg_64_align2 = COPY %5
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3866633 /* reguse:AReg_64_Align2 */, %6:areg_64_align2
+ GLOBAL_STORE_DWORDX2 %0, %6, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ SI_RETURN
+...
+
+---
+name: test_rewrite_mfma_src2_is_subreg_chain_mfma_sub_def
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_src2_is_subreg_chain_mfma_sub_def
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:areg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
+ ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_1:%[0-9]+]].sub0_sub1:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3866633 /* reguse:AReg_64_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
+ %other_use:vreg_64_align2 = COPY %4
+ undef %5.sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4, 0, 0, 0, implicit $mode, implicit $exec
+ %6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ %7:areg_64_align2 = COPY %6
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3866633 /* reguse:AReg_64_Align2 */, %7
+ GLOBAL_STORE_DWORDX2 %0, %7, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ SI_RETURN
+
+...
+
+---
+name: test_rewrite_mfma_src2_is_subreg_chain_mfma_sub_def_copy_subreg_insert
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_src2_is_subreg_chain_mfma_sub_def_copy_subreg_insert
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:areg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_1:%[0-9]+]].sub0_sub1:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
+ undef %5.sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4, 0, 0, 0, implicit $mode, implicit $exec
+ %other_use:vreg_64_align2 = COPY %5.sub0_sub1
+ %6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ undef %8.sub0_sub1:areg_128_align2 = COPY %6
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, %8:areg_128_align2
+ GLOBAL_STORE_DWORDX4 %0, %8, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ SI_RETURN
+
+...
+
+---
+name: test_rewrite_mfma_src2_is_subreg_chain_mfma_sub_def_copy_subreg_extract
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_src2_is_subreg_chain_mfma_sub_def_copy_subreg_extract
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:areg_64_align2 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead %other_use0:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub0
+ ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_1:%[0-9]+]].sub0_sub1:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:agpr_32 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, [[COPY3]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+ %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec
+ %other_use0:vreg_64_align2 = COPY %4.sub0
+ undef %5.sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4, 0, 0, 0, implicit $mode, implicit $exec
+ %other_use1:vreg_64_align2 = COPY %5.sub0_sub1
+ %6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ %8:agpr_32 = COPY %6
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, %8:agpr_32
+ GLOBAL_STORE_DWORD %0, %8, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+ SI_RETURN
+
+...
+
+# Multiple MFMAs in the chain use the same register with different
+# subregisters.
+---
+name: test_rewrite_mfma_src2_chain_different_subregs_same_reg_full_copy_use
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_src2_chain_different_subregs_same_reg_full_copy_use
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub2_sub3:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub0_sub1:areg_128_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: dead %other_use0:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub0_sub1
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub0_sub1:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3
+ ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY3]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ %4.sub0_sub1 = IMPLICIT_DEF
+ %other_use0:vreg_64_align2 = COPY %4.sub0_sub1
+ %4.sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec
+ %other_use1:vreg_64_align2 = COPY %4.sub2_sub3
+ %other_use2:vreg_64 = COPY %4.sub1_sub2
+ %6:areg_128_align2 = COPY %4
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, %6:areg_128_align2
+ GLOBAL_STORE_DWORDX4 %0, %6, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+ SI_RETURN
+...
+
+# Multiple MFMAs in the chain use the same register with different
+# subregisters.
+---
+name: test_rewrite_mfma_src2_chain_different_subregs_same_reg_extract_use_0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_src2_chain_different_subregs_same_reg_extract_use_0
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub2_sub3:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub0_sub1:areg_128_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: dead %other_use0:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub0_sub1
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub0_sub1:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3
+ ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY3]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ %4.sub0_sub1 = IMPLICIT_DEF
+ %other_use0:vreg_64_align2 = COPY %4.sub0_sub1
+ %4.sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec
+ %other_use1:vreg_64_align2 = COPY %4.sub2_sub3
+ %other_use2:vreg_64 = COPY %4.sub1_sub2
+ %6:areg_64 = COPY %4.sub1_sub2
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3473417 /* reguse:AReg_64 */, %6:areg_64
+ GLOBAL_STORE_DWORDX2 %0, %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ SI_RETURN
+...
+
+---
+name: test_rewrite_mfma_src2_chain_different_subregs_same_reg_extract_use_1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+ ; CHECK-LABEL: name: test_rewrite_mfma_src2_chain_different_subregs_same_reg_extract_use_1
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr4_vgpr5
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub0_sub1:vreg_128_align2 = IMPLICIT_DEF
+ ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]].sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: dead %other_use0:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub0_sub1
+ ; CHECK-NEXT: %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub2_sub3
+ ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub1_sub2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub1_sub2
+ ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY3]]
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr4_vgpr5
+ %1:av_64_align2 = COPY $vgpr0_vgpr1
+ %2:av_64_align2 = COPY $vgpr2_vgpr3
+ %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+ undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec
+ %4.sub0_sub1 = IMPLICIT_DEF
+ %3.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4.sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec
+ %other_use0:vreg_64_align2 = COPY %4.sub0_sub1
+ %other_use1:vreg_64_align2 = COPY %4.sub2_sub3
+ %other_use2:vreg_64 = COPY %4.sub1_sub2
+ %6:areg_64 = COPY %4.sub1_sub2
+ INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3473417 /* reguse:AReg_64 */, %6:areg_64
+ GLOBAL_STORE_DWORDX2 %0, %other_use1, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+ SI_RETURN
+...
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index 0c6339e..81613f6 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mcpu=gfx90a < %s | FileCheck %s
+; RUN: llc -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck %s
target triple = "amdgcn-amd-amdhsa"
@@ -7,7 +7,10 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrsp
; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
+; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 v[28:31], v0, s[0:1] offset:112
; CHECK-NEXT: global_load_dwordx4 v[24:27], v0, s[0:1] offset:96
@@ -18,126 +21,67 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrsp
; CHECK-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-NEXT: v_accvgpr_write_b32 a0, 1.0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, 2.0
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
-; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
-; CHECK-NEXT: v_accvgpr_write_b32 a2, v2
-; CHECK-NEXT: v_accvgpr_write_b32 a3, v3
-; CHECK-NEXT: v_accvgpr_write_b32 a4, v4
-; CHECK-NEXT: v_accvgpr_write_b32 a5, v5
-; CHECK-NEXT: v_accvgpr_write_b32 a6, v6
-; CHECK-NEXT: v_accvgpr_write_b32 a7, v7
-; CHECK-NEXT: v_accvgpr_write_b32 a8, v8
-; CHECK-NEXT: v_accvgpr_write_b32 a9, v9
-; CHECK-NEXT: v_accvgpr_write_b32 a10, v10
-; CHECK-NEXT: v_accvgpr_write_b32 a11, v11
-; CHECK-NEXT: v_accvgpr_write_b32 a12, v12
-; CHECK-NEXT: v_accvgpr_write_b32 a13, v13
-; CHECK-NEXT: v_accvgpr_write_b32 a14, v14
-; CHECK-NEXT: v_accvgpr_write_b32 a15, v15
-; CHECK-NEXT: v_accvgpr_write_b32 a16, v16
-; CHECK-NEXT: v_accvgpr_write_b32 a17, v17
-; CHECK-NEXT: v_accvgpr_write_b32 a18, v18
-; CHECK-NEXT: v_accvgpr_write_b32 a19, v19
-; CHECK-NEXT: v_accvgpr_write_b32 a20, v20
-; CHECK-NEXT: v_accvgpr_write_b32 a21, v21
-; CHECK-NEXT: v_accvgpr_write_b32 a22, v22
-; CHECK-NEXT: v_accvgpr_write_b32 a23, v23
-; CHECK-NEXT: v_accvgpr_write_b32 a24, v24
-; CHECK-NEXT: v_accvgpr_write_b32 a25, v25
-; CHECK-NEXT: v_accvgpr_write_b32 a26, v26
-; CHECK-NEXT: v_accvgpr_write_b32 a27, v27
-; CHECK-NEXT: v_accvgpr_write_b32 a28, v28
-; CHECK-NEXT: v_accvgpr_write_b32 a29, v29
-; CHECK-NEXT: v_accvgpr_write_b32 a30, v30
-; CHECK-NEXT: v_accvgpr_write_b32 a31, v31
-; CHECK-NEXT: v_mov_b32_e32 v0, 1.0
-; CHECK-NEXT: v_mov_b32_e32 v1, 2.0
-; CHECK-NEXT: s_nop 1
-; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31]
-; CHECK-NEXT: s_nop 7
-; CHECK-NEXT: s_nop 7
-; CHECK-NEXT: s_nop 2
-; CHECK-NEXT: v_accvgpr_read_b32 v4, a59
-; CHECK-NEXT: v_accvgpr_read_b32 v5, a58
-; CHECK-NEXT: v_accvgpr_read_b32 v6, a57
-; CHECK-NEXT: v_accvgpr_read_b32 v7, a56
-; CHECK-NEXT: v_accvgpr_read_b32 v8, a55
-; CHECK-NEXT: v_accvgpr_read_b32 v9, a54
-; CHECK-NEXT: v_accvgpr_read_b32 v10, a53
-; CHECK-NEXT: v_accvgpr_read_b32 v11, a52
-; CHECK-NEXT: v_accvgpr_read_b32 v12, a51
-; CHECK-NEXT: v_accvgpr_read_b32 v13, a50
-; CHECK-NEXT: v_accvgpr_read_b32 v14, a49
-; CHECK-NEXT: v_accvgpr_read_b32 v15, a48
-; CHECK-NEXT: v_accvgpr_read_b32 v16, a47
-; CHECK-NEXT: v_accvgpr_read_b32 v17, a46
-; CHECK-NEXT: v_accvgpr_read_b32 v18, a45
-; CHECK-NEXT: v_accvgpr_read_b32 v19, a44
-; CHECK-NEXT: v_accvgpr_read_b32 v20, a43
-; CHECK-NEXT: v_accvgpr_read_b32 v21, a42
-; CHECK-NEXT: v_accvgpr_read_b32 v22, a41
-; CHECK-NEXT: v_accvgpr_read_b32 v23, a40
-; CHECK-NEXT: v_accvgpr_read_b32 v24, a39
-; CHECK-NEXT: v_accvgpr_read_b32 v25, a38
-; CHECK-NEXT: v_accvgpr_read_b32 v26, a37
-; CHECK-NEXT: v_accvgpr_read_b32 v27, a36
-; CHECK-NEXT: v_accvgpr_read_b32 v28, a35
-; CHECK-NEXT: v_accvgpr_read_b32 v29, a34
-; CHECK-NEXT: v_accvgpr_mov_b32 a2, a32
-; CHECK-NEXT: v_accvgpr_mov_b32 a3, a33
-; CHECK-NEXT: v_accvgpr_write_b32 a4, v29
-; CHECK-NEXT: v_accvgpr_write_b32 a5, v28
-; CHECK-NEXT: v_accvgpr_write_b32 a6, v27
-; CHECK-NEXT: v_accvgpr_write_b32 a7, v26
-; CHECK-NEXT: v_accvgpr_write_b32 a8, v25
-; CHECK-NEXT: v_accvgpr_write_b32 a9, v24
-; CHECK-NEXT: v_accvgpr_write_b32 a10, v23
-; CHECK-NEXT: v_accvgpr_write_b32 a11, v22
-; CHECK-NEXT: v_accvgpr_write_b32 a12, v21
-; CHECK-NEXT: v_accvgpr_write_b32 a13, v20
-; CHECK-NEXT: v_accvgpr_write_b32 a14, v19
-; CHECK-NEXT: v_accvgpr_write_b32 a15, v18
-; CHECK-NEXT: v_accvgpr_write_b32 a16, v17
-; CHECK-NEXT: v_accvgpr_write_b32 a17, v16
-; CHECK-NEXT: v_accvgpr_write_b32 a18, v15
-; CHECK-NEXT: v_accvgpr_write_b32 a19, v14
-; CHECK-NEXT: v_accvgpr_write_b32 a20, v13
-; CHECK-NEXT: v_accvgpr_write_b32 a21, v12
-; CHECK-NEXT: v_accvgpr_write_b32 a22, v11
-; CHECK-NEXT: v_accvgpr_write_b32 a23, v10
-; CHECK-NEXT: v_accvgpr_write_b32 a24, v9
-; CHECK-NEXT: v_accvgpr_write_b32 a25, v8
-; CHECK-NEXT: v_accvgpr_write_b32 a26, v7
-; CHECK-NEXT: v_accvgpr_write_b32 a27, v6
-; CHECK-NEXT: v_accvgpr_write_b32 a28, v5
-; CHECK-NEXT: v_accvgpr_write_b32 a29, v4
-; CHECK-NEXT: v_accvgpr_mov_b32 a30, a60
-; CHECK-NEXT: v_accvgpr_mov_b32 a31, a61
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], a0, a1, v[0:31]
+; CHECK-NEXT: s_nop 7
+; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 1
-; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v2, v32
+; CHECK-NEXT: v_mov_b32_e32 v3, v33
+; CHECK-NEXT: v_mov_b32_e32 v4, v34
+; CHECK-NEXT: v_mov_b32_e32 v5, v35
+; CHECK-NEXT: v_mov_b32_e32 v6, v36
+; CHECK-NEXT: v_mov_b32_e32 v7, v37
+; CHECK-NEXT: v_mov_b32_e32 v8, v38
+; CHECK-NEXT: v_mov_b32_e32 v9, v39
+; CHECK-NEXT: v_mov_b32_e32 v10, v40
+; CHECK-NEXT: v_mov_b32_e32 v11, v41
+; CHECK-NEXT: v_mov_b32_e32 v12, v42
+; CHECK-NEXT: v_mov_b32_e32 v13, v43
+; CHECK-NEXT: v_mov_b32_e32 v14, v44
+; CHECK-NEXT: v_mov_b32_e32 v15, v45
+; CHECK-NEXT: v_mov_b32_e32 v16, v46
+; CHECK-NEXT: v_mov_b32_e32 v17, v47
+; CHECK-NEXT: v_mov_b32_e32 v18, v48
+; CHECK-NEXT: v_mov_b32_e32 v19, v49
+; CHECK-NEXT: v_mov_b32_e32 v20, v50
+; CHECK-NEXT: v_mov_b32_e32 v21, v51
+; CHECK-NEXT: v_mov_b32_e32 v22, v52
+; CHECK-NEXT: v_mov_b32_e32 v23, v53
+; CHECK-NEXT: v_mov_b32_e32 v24, v54
+; CHECK-NEXT: v_mov_b32_e32 v25, v55
+; CHECK-NEXT: v_mov_b32_e32 v26, v56
+; CHECK-NEXT: v_mov_b32_e32 v27, v57
+; CHECK-NEXT: v_mov_b32_e32 v28, v58
+; CHECK-NEXT: v_mov_b32_e32 v29, v59
+; CHECK-NEXT: v_mov_b32_e32 v30, v60
+; CHECK-NEXT: v_mov_b32_e32 v31, v61
+; CHECK-NEXT: v_mov_b32_e32 v32, 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], a0, a1, v[0:31]
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 1
-; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; CHECK-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
; CHECK-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <32 x float>, ptr addrspace(1) %arg, i32 %id
%in.1 = load <32 x float>, ptr addrspace(1) %gep, align 128
- %mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
- %mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0)
+ %mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %in.1, i32 0, i32 0, i32 0)
+ %mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %mai.1, i32 0, i32 0, i32 0)
%tmp.1 = shufflevector <32 x float> %mai.2, <32 x float> %mai.1, <32 x i32> <i32 32, i32 33, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29>
- %mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %tmp.1, i32 0, i32 0, i32 0)
+ %mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %tmp.1, i32 0, i32 0, i32 0)
store <32 x float> %mai.3, ptr addrspace(1) %arg, align 128
ret void
}
@@ -146,8 +90,127 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle(
; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
-; CHECK-NEXT: v_mov_b32_e32 v1, 2.0
+; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
+; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[28:31], v0, s[0:1] offset:112
+; CHECK-NEXT: global_load_dwordx4 v[24:27], v0, s[0:1] offset:96
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v0, s[0:1] offset:80
+; CHECK-NEXT: global_load_dwordx4 v[16:19], v0, s[0:1] offset:64
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] offset:48
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] offset:32
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
+; CHECK-NEXT: v_mov_b32_e32 v32, 0
+; CHECK-NEXT: s_nop 7
+; CHECK-NEXT: s_nop 7
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; CHECK-NEXT: s_endpgm
+bb:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <32 x float>, ptr addrspace(1) %arg, i32 %id
+ %in.1 = load <32 x float>, ptr addrspace(1) %gep, align 128
+ %mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %in.1, i32 0, i32 0, i32 0)
+ %mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %mai.1, i32 0, i32 0, i32 0)
+ %mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %mai.2, i32 0, i32 0, i32 0)
+ store <32 x float> %mai.3, ptr addrspace(1) %arg, align 128
+ ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm0_src2(ptr addrspace(1) %arg) #0 {
+; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm0_src2:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
+; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 0
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
+; CHECK-NEXT: v_mov_b32_e32 v32, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_nop 7
+; CHECK-NEXT: s_nop 7
+; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; CHECK-NEXT: s_endpgm
+bb:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <32 x float>, ptr addrspace(1) %arg, i32 %id
+ %in.1 = load <32 x float>, ptr addrspace(1) %gep, align 128
+ %mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
+ %mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %mai.1, i32 0, i32 0, i32 0)
+ %mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %mai.2, i32 0, i32 0, i32 0)
+ store <32 x float> %mai.3, ptr addrspace(1) %arg, align 128
+ ret void
+}
+
+define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm1_src2(ptr addrspace(1) %arg) #0 {
+; CHECK-LABEL: test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm1_src2:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: v_mov_b32_e32 v32, 1.0
+; CHECK-NEXT: v_mov_b32_e32 v33, 2.0
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 1.0
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
+; CHECK-NEXT: v_mov_b32_e32 v32, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_nop 7
+; CHECK-NEXT: s_nop 7
+; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; CHECK-NEXT: s_endpgm
+bb:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <32 x float>, ptr addrspace(1) %arg, i32 %id
+ %in.1 = load <32 x float>, ptr addrspace(1) %gep, align 128
+ %mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> splat (float 1.000000e+00), i32 0, i32 0, i32 0)
+ %mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %mai.1, i32 0, i32 0, i32 0)
+ %mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.000000e+00, float 2.000000e+00, <32 x float> %mai.2, i32 0, i32 0, i32 0)
+ store <32 x float> %mai.3, ptr addrspace(1) %arg, align 128
+ ret void
+}
+
+; The inline asm requires the value be copied to an AGPR class, not
+; the AV_* pseudo we usually expect for register allocator live range
+; splits.
+define amdgpu_kernel void @test_rewrite_mfma_direct_copy_to_agpr_class(ptr addrspace(1) %arg) #0 {
+; CHECK-LABEL: test_rewrite_mfma_direct_copy_to_agpr_class:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; CHECK-NEXT: v_mov_b32_e32 v32, 2.0
+; CHECK-NEXT: v_mov_b32_e32 v33, 4.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
; CHECK-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
@@ -157,39 +220,700 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle(
; CHECK-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
; CHECK-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
; CHECK-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
-; CHECK-NEXT: v_mov_b32_e32 v0, 1.0
; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v33, a[0:31]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_endpgm
+bb:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <32 x float>, ptr addrspace(1) %arg, i32 %id
+ %in = load <32 x float>, ptr addrspace(1) %gep, align 128
+ %mai = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %in, i32 0, i32 0, i32 0)
+ call void asm sideeffect "; use $0", "a"(<32 x float> %mai)
+ ret void
+}
+
+define void @test_rewrite_mfma_imm_src2(float %arg0, float %arg1) #0 {
+; CHECK-LABEL: test_rewrite_mfma_imm_src2:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 2.0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %mai = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %arg0, float %arg1, <32 x float> splat (float 2.0), i32 0, i32 0, i32 0)
+ call void asm sideeffect "; use $0", "a"(<32 x float> %mai)
+ ret void
+}
+
+define void @test_rewrite_mfma_subreg_extract0(float %arg0, float %arg1, ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: test_rewrite_mfma_subreg_extract0:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
+; CHECK-NEXT: global_load_dwordx4 a[24:27], v[2:3], off offset:96
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
+; CHECK-NEXT: global_load_dwordx4 a[16:19], v[2:3], off offset:64
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
+; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %src2 = load <32 x float>, ptr addrspace(1) %ptr
+ %mai = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %arg0, float %arg1, <32 x float> %src2, i32 0, i32 0, i32 0)
+ %extract.sub4 = shufflevector <32 x float> %mai, <32 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ call void asm sideeffect "; use $0", "a"(<4 x float> %extract.sub4)
+ ret void
+}
+
+define void @test_rewrite_mfma_subreg_extract1(float %arg0, float %arg1, ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: test_rewrite_mfma_subreg_extract1:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
+; CHECK-NEXT: global_load_dwordx4 a[24:27], v[2:3], off offset:96
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
+; CHECK-NEXT: global_load_dwordx4 a[16:19], v[2:3], off offset:64
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
+; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[4:7]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %src2 = load <32 x float>, ptr addrspace(1) %ptr
+ %mai = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %arg0, float %arg1, <32 x float> %src2, i32 0, i32 0, i32 0)
+ %extract.sub4 = shufflevector <32 x float> %mai, <32 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ call void asm sideeffect "; use $0", "a"(<4 x float> %extract.sub4)
+ ret void
+}
+
+; odd subregister offset
+define void @test_rewrite_mfma_subreg_extract2(float %arg0, float %arg1, ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: test_rewrite_mfma_subreg_extract2:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 a[28:31], v[2:3], off offset:112
+; CHECK-NEXT: global_load_dwordx4 a[24:27], v[2:3], off offset:96
+; CHECK-NEXT: global_load_dwordx4 a[20:23], v[2:3], off offset:80
+; CHECK-NEXT: global_load_dwordx4 a[16:19], v[2:3], off offset:64
+; CHECK-NEXT: global_load_dwordx4 a[12:15], v[2:3], off offset:48
+; CHECK-NEXT: global_load_dwordx4 a[8:11], v[2:3], off offset:32
+; CHECK-NEXT: global_load_dwordx4 a[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; CHECK-NEXT: s_nop 7
+; CHECK-NEXT: s_nop 7
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_accvgpr_mov_b32 a0, a1
+; CHECK-NEXT: v_accvgpr_mov_b32 a1, a2
+; CHECK-NEXT: v_accvgpr_mov_b32 a2, a3
+; CHECK-NEXT: v_accvgpr_mov_b32 a3, a4
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %src2 = load <32 x float>, ptr addrspace(1) %ptr
+ %mai = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %arg0, float %arg1, <32 x float> %src2, i32 0, i32 0, i32 0)
+ %extract.sub4 = shufflevector <32 x float> %mai, <32 x float> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+ call void asm sideeffect "; use $0", "a"(<4 x float> %extract.sub4)
+ ret void
+}
+
+define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
+; CHECK-LABEL: illegal_mfma_after_rewrite:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_mov_b32 s0, 0
+; CHECK-NEXT: s_mov_b32 s1, s0
+; CHECK-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def s[0:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
+; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00
+; CHECK-NEXT: s_mov_b32 s1, s0
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[4:7]
+; CHECK-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; CHECK-NEXT: s_mov_b32 s0, 0x7e007e00
+; CHECK-NEXT: s_mov_b32 s1, s0
+; CHECK-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[12:13], v[4:7]
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[18:21], v[8:9], v[10:11], v[4:7]
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v2
+; CHECK-NEXT: v_accvgpr_write_b32 a3, v3
+; CHECK-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; CHECK-NEXT: v_mov_b32_e32 v5, v4
+; CHECK-NEXT: v_mov_b32_e32 v6, v4
+; CHECK-NEXT: v_mov_b32_e32 v7, v4
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[14:17]
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[4:7]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[4:7]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[12:13], v[4:7]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[26:29], v[8:9], v[8:9], v[4:7]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[0:3]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[22:25]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[8:9], v[8:9], v[26:29]
+; CHECK-NEXT: s_nop 5
+; CHECK-NEXT: v_cvt_f16_f32_e32 v23, v14
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[18:21]
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[12:13], v[8:9], v[0:3]
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_accvgpr_read_b32 v19, a3
+; CHECK-NEXT: v_accvgpr_read_b32 v18, a2
+; CHECK-NEXT: v_mov_b64_e32 v[20:21], 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_accvgpr_read_b32 v17, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v16, a0
+; CHECK-NEXT: v_cvt_f16_f32_e32 v15, v22
+; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[8:9], v[8:9], v[16:19]
+; CHECK-NEXT: v_cvt_f16_f32_e32 v12, v0
+; CHECK-NEXT: global_store_short v[20:21], v23, off
+; CHECK-NEXT: buffer_wbl2 sc0 sc1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_inv sc0 sc1
+; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[10:11], v[8:9], v[4:7]
+; CHECK-NEXT: global_store_short v[20:21], v15, off
+; CHECK-NEXT: buffer_wbl2 sc0 sc1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_inv sc0 sc1
+; CHECK-NEXT: global_store_short v[20:21], v14, off
+; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v16
+; CHECK-NEXT: buffer_wbl2 sc0 sc1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_inv sc0 sc1
+; CHECK-NEXT: global_store_short v[20:21], v14, off
+; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CHECK-NEXT: buffer_wbl2 sc0 sc1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_inv sc0 sc1
+; CHECK-NEXT: global_store_short v[20:21], v12, off
+; CHECK-NEXT: buffer_wbl2 sc0 sc1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_inv sc0 sc1
+; CHECK-NEXT: global_store_short v[20:21], v0, off
+; CHECK-NEXT: s_endpgm
+entry:
+ %k0 = call <4 x float> asm sideeffect "; def $0", "=s"()
+ %i2 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %k0, i32 0, i32 0, i32 0)
+ %i4 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> splat (half 0xH3C00), <4 x float> %k0, i32 0, i32 0, i32 0)
+ %i6 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> splat (half 0xH7E00), <4 x float> %k0, i32 0, i32 0, i32 0)
+ %i5 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> splat (float 0x7FF8000000000000), i32 0, i32 0, i32 0)
+ %k = call <4 x float> asm sideeffect "; def $0", "=v"()
+ %i1 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %k, i32 0, i32 0, i32 0)
+ %i7 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> splat (half 0xH3C00), <4 x float> %k, i32 0, i32 0, i32 0)
+ %i17 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %i1, i32 0, i32 0, i32 0)
+ %i19 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %i4, i32 0, i32 0, i32 0)
+ %c_thread_buf.0 = extractelement <4 x float> %i19, i64 0
+ %conv.0 = fptrunc float %c_thread_buf.0 to half
+ store half %conv.0, ptr addrspace(1) null, align 2
+ fence seq_cst
+ %i22 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %i5, i32 0, i32 0, i32 0)
+ %c_thread_buf.1 = extractelement <4 x float> %i22, i64 0
+ %conv1 = fptrunc float %c_thread_buf.1 to half
+ store half %conv1, ptr addrspace(1) null, align 2
+ fence seq_cst
+ %i23 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %i6, i32 0, i32 0, i32 0)
+ %c_thread_buf.2 = extractelement <4 x float> %i23, i64 0
+ %conv2 = fptrunc float %c_thread_buf.2 to half
+ store half %conv2, ptr addrspace(1) null, align 2
+ fence seq_cst
+ %i25 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %i2, i32 0, i32 0, i32 0)
+ %c_thread_buf.3 = extractelement <4 x float> %i25, i64 0
+ %conv3 = fptrunc float %c_thread_buf.3 to half
+ store half %conv3, ptr addrspace(1) null, align 2
+ fence seq_cst
+ %i26 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> %i7, i32 0, i32 0, i32 0)
+ %i27 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> splat (half 0xH3C00), <4 x half> zeroinitializer, <4 x float> %i26, i32 0, i32 0, i32 0)
+ %c_thread_buf.4 = extractelement <4 x float> %i27, i64 0
+ %conv4 = fptrunc float %c_thread_buf.4 to half
+ store half %conv4, ptr addrspace(1) null, align 2
+ fence seq_cst
+ %i31 = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> splat (half 0xH7E00), <4 x half> zeroinitializer, <4 x float> %i17, i32 0, i32 0, i32 0)
+ %c_thread_buf.5 = extractelement <4 x float> %i31, i64 0
+ %conv5 = fptrunc float %c_thread_buf.5 to half
+ store half %conv5, ptr addrspace(1) null, align 2
+ ret void
+}
+
+define void @test_rewrite_mfma_subreg_insert0(float %arg0, float %arg1, ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: test_rewrite_mfma_subreg_insert0:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:7]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %src2 = load <4 x float>, ptr addrspace(1) %ptr
+ %mai = call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %arg0, float %arg1, <4 x float> %src2, i32 0, i32 0, i32 0)
+ %insert.sub0 = shufflevector <4 x float> %mai, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+ call void asm sideeffect "; use $0", "a"(<8 x float> %insert.sub0)
+ ret void
+}
+
+; odd subregister offset
+define void @test_rewrite_mfma_subreg_insert1(float %arg0, float %arg1, ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: test_rewrite_mfma_subreg_insert1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[2:5]
+; CHECK-NEXT: s_nop 3
+; CHECK-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v1
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v3
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:7]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %src2 = load <4 x float>, ptr addrspace(1) %ptr
+ %mai = call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %arg0, float %arg1, <4 x float> %src2, i32 0, i32 0, i32 0)
+ %insert.sub0 = shufflevector <4 x float> %mai, <4 x float> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison>
+ call void asm sideeffect "; use $0", "a"(<8 x float> %insert.sub0)
+ ret void
+}
+
+define void @test_rewrite_mfma_subreg_insert2(double %arg0, double %arg1, ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: test_rewrite_mfma_subreg_insert2:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx2 a[0:1], v[4:5], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %src2 = load double, ptr addrspace(1) %ptr
+ %mai = call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %arg0, double %arg1, double %src2, i32 0, i32 0, i32 0)
+ %insert.sub0 = insertelement <2 x double> poison, double %mai, i32 0
+ call void asm sideeffect "; use $0", "a"(<2 x double> %insert.sub0)
+ ret void
+}
+
+define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) #0 {
+; CHECK-LABEL: test_rewrite_mfma_direct_copy_from_agpr_class:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_write_b32 a32, v0
+; CHECK-NEXT: v_accvgpr_read_b32 v63, a31
+; CHECK-NEXT: v_accvgpr_read_b32 v62, a30
+; CHECK-NEXT: v_accvgpr_read_b32 v61, a29
+; CHECK-NEXT: v_accvgpr_read_b32 v60, a28
+; CHECK-NEXT: v_accvgpr_read_b32 v59, a27
+; CHECK-NEXT: v_accvgpr_read_b32 v58, a26
+; CHECK-NEXT: v_accvgpr_read_b32 v57, a25
+; CHECK-NEXT: v_accvgpr_read_b32 v56, a24
+; CHECK-NEXT: v_accvgpr_read_b32 v55, a23
+; CHECK-NEXT: v_accvgpr_read_b32 v54, a22
+; CHECK-NEXT: v_accvgpr_read_b32 v53, a21
+; CHECK-NEXT: v_accvgpr_read_b32 v52, a20
+; CHECK-NEXT: v_accvgpr_read_b32 v51, a19
+; CHECK-NEXT: v_accvgpr_read_b32 v50, a18
+; CHECK-NEXT: v_accvgpr_read_b32 v49, a17
+; CHECK-NEXT: v_accvgpr_read_b32 v48, a16
+; CHECK-NEXT: v_accvgpr_read_b32 v47, a15
+; CHECK-NEXT: v_accvgpr_read_b32 v46, a14
+; CHECK-NEXT: v_accvgpr_read_b32 v45, a13
+; CHECK-NEXT: v_accvgpr_read_b32 v44, a12
+; CHECK-NEXT: v_accvgpr_read_b32 v43, a11
+; CHECK-NEXT: v_accvgpr_read_b32 v42, a10
+; CHECK-NEXT: v_accvgpr_read_b32 v41, a9
+; CHECK-NEXT: v_accvgpr_read_b32 v40, a8
+; CHECK-NEXT: v_accvgpr_read_b32 v39, a7
+; CHECK-NEXT: v_accvgpr_read_b32 v38, a6
+; CHECK-NEXT: v_accvgpr_read_b32 v37, a5
+; CHECK-NEXT: v_accvgpr_read_b32 v36, a4
+; CHECK-NEXT: v_accvgpr_read_b32 v35, a3
+; CHECK-NEXT: v_accvgpr_read_b32 v34, a2
+; CHECK-NEXT: v_accvgpr_read_b32 v33, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v32, a0
+; CHECK-NEXT: v_accvgpr_write_b32 a0, 2.0
+; CHECK-NEXT: v_accvgpr_write_b32 a1, 4.0
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], a0, a1, v[32:63]
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v32
+; CHECK-NEXT: v_accvgpr_write_b32 a1, v33
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v34
+; CHECK-NEXT: v_accvgpr_write_b32 a3, v35
+; CHECK-NEXT: v_accvgpr_write_b32 a4, v36
+; CHECK-NEXT: v_accvgpr_write_b32 a5, v37
+; CHECK-NEXT: v_accvgpr_write_b32 a6, v38
+; CHECK-NEXT: v_accvgpr_write_b32 a7, v39
+; CHECK-NEXT: v_accvgpr_write_b32 a8, v40
+; CHECK-NEXT: v_accvgpr_write_b32 a9, v41
+; CHECK-NEXT: v_accvgpr_write_b32 a10, v42
+; CHECK-NEXT: v_accvgpr_write_b32 a11, v43
+; CHECK-NEXT: v_accvgpr_write_b32 a12, v44
+; CHECK-NEXT: v_accvgpr_write_b32 a13, v45
+; CHECK-NEXT: v_accvgpr_write_b32 a14, v46
+; CHECK-NEXT: v_accvgpr_write_b32 a15, v47
+; CHECK-NEXT: v_accvgpr_write_b32 a16, v48
+; CHECK-NEXT: v_accvgpr_write_b32 a17, v49
+; CHECK-NEXT: v_accvgpr_write_b32 a18, v50
+; CHECK-NEXT: v_accvgpr_write_b32 a19, v51
+; CHECK-NEXT: v_accvgpr_write_b32 a20, v52
+; CHECK-NEXT: v_accvgpr_write_b32 a21, v53
+; CHECK-NEXT: v_accvgpr_write_b32 a22, v54
+; CHECK-NEXT: v_accvgpr_write_b32 a23, v55
+; CHECK-NEXT: v_accvgpr_write_b32 a24, v56
+; CHECK-NEXT: v_accvgpr_write_b32 a25, v57
+; CHECK-NEXT: v_accvgpr_write_b32 a26, v58
+; CHECK-NEXT: v_accvgpr_write_b32 a27, v59
+; CHECK-NEXT: v_accvgpr_write_b32 a28, v60
+; CHECK-NEXT: v_accvgpr_write_b32 a29, v61
+; CHECK-NEXT: v_accvgpr_write_b32 a30, v62
+; CHECK-NEXT: v_accvgpr_write_b32 a31, v63
+; CHECK-NEXT: v_accvgpr_read_b32 v32, a32
+; CHECK-NEXT: v_mov_b32_e32 v33, 0x41000000
+; CHECK-NEXT: v_and_b32_e32 v32, 0x3ff, v32
+; CHECK-NEXT: v_lshlrev_b32_e32 v32, 7, v32
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v34, 0x41800000
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; CHECK-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a2
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a3
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a4
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a5
+; CHECK-NEXT: v_accvgpr_read_b32 v6, a6
+; CHECK-NEXT: v_accvgpr_read_b32 v7, a7
+; CHECK-NEXT: v_accvgpr_read_b32 v8, a8
+; CHECK-NEXT: v_accvgpr_read_b32 v9, a9
+; CHECK-NEXT: v_accvgpr_read_b32 v10, a10
+; CHECK-NEXT: v_accvgpr_read_b32 v11, a11
+; CHECK-NEXT: v_accvgpr_read_b32 v12, a12
+; CHECK-NEXT: v_accvgpr_read_b32 v13, a13
+; CHECK-NEXT: v_accvgpr_read_b32 v14, a14
+; CHECK-NEXT: v_accvgpr_read_b32 v15, a15
+; CHECK-NEXT: v_accvgpr_read_b32 v16, a16
+; CHECK-NEXT: v_accvgpr_read_b32 v17, a17
+; CHECK-NEXT: v_accvgpr_read_b32 v18, a18
+; CHECK-NEXT: v_accvgpr_read_b32 v19, a19
+; CHECK-NEXT: v_accvgpr_read_b32 v20, a20
+; CHECK-NEXT: v_accvgpr_read_b32 v21, a21
+; CHECK-NEXT: v_accvgpr_read_b32 v22, a22
+; CHECK-NEXT: v_accvgpr_read_b32 v23, a23
+; CHECK-NEXT: v_accvgpr_read_b32 v24, a24
+; CHECK-NEXT: v_accvgpr_read_b32 v25, a25
+; CHECK-NEXT: v_accvgpr_read_b32 v26, a26
+; CHECK-NEXT: v_accvgpr_read_b32 v27, a27
+; CHECK-NEXT: v_accvgpr_read_b32 v28, a28
+; CHECK-NEXT: v_accvgpr_read_b32 v29, a29
+; CHECK-NEXT: v_accvgpr_read_b32 v30, a30
+; CHECK-NEXT: v_accvgpr_read_b32 v31, a31
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31]
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 7
; CHECK-NEXT: s_nop 1
-; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; CHECK-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; CHECK-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; CHECK-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; CHECK-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; CHECK-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; CHECK-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
+; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
+; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
+; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
+; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
+; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
+; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3]
+; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
; CHECK-NEXT: s_endpgm
-bb:
+ %src2 = call <32 x float> asm sideeffect "; def $0", "=a"()
+ %mai0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0)
+ %mai1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 8.0, float 16.0, <32 x float> %src2, i32 0, i32 0, i32 0)
%id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep = getelementptr <32 x float>, ptr addrspace(1) %arg, i32 %id
- %in.1 = load <32 x float>, ptr addrspace(1) %gep, align 128
- %mai.1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
- %mai.2 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0)
- %mai.3 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.2, i32 0, i32 0, i32 0)
- store <32 x float> %mai.3, ptr addrspace(1) %arg, align 128
+ %gep0 = getelementptr <32 x float>, ptr addrspace(1) %arg0, i32 %id
+ store <32 x float> %mai0, ptr addrspace(1) %gep0, align 128
+ %gep1 = getelementptr <32 x float>, ptr addrspace(1) %arg1, i32 %id
+ store <32 x float> %mai1, ptr addrspace(1) %gep1, align 128
+ ret void
+}
+
+define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class_chain(ptr addrspace(1) %arg0) #0 {
+; CHECK-LABEL: test_rewrite_mfma_direct_copy_from_agpr_class_chain:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v1, 2.0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_mov_b32_e32 v34, 4.0
+; CHECK-NEXT: v_accvgpr_read_b32 v33, a31
+; CHECK-NEXT: v_accvgpr_read_b32 v32, a30
+; CHECK-NEXT: v_accvgpr_read_b32 v31, a29
+; CHECK-NEXT: v_accvgpr_read_b32 v30, a28
+; CHECK-NEXT: v_accvgpr_read_b32 v29, a27
+; CHECK-NEXT: v_accvgpr_read_b32 v28, a26
+; CHECK-NEXT: v_accvgpr_read_b32 v27, a25
+; CHECK-NEXT: v_accvgpr_read_b32 v26, a24
+; CHECK-NEXT: v_accvgpr_read_b32 v25, a23
+; CHECK-NEXT: v_accvgpr_read_b32 v24, a22
+; CHECK-NEXT: v_accvgpr_read_b32 v23, a21
+; CHECK-NEXT: v_accvgpr_read_b32 v22, a20
+; CHECK-NEXT: v_accvgpr_read_b32 v21, a19
+; CHECK-NEXT: v_accvgpr_read_b32 v20, a18
+; CHECK-NEXT: v_accvgpr_read_b32 v19, a17
+; CHECK-NEXT: v_accvgpr_read_b32 v18, a16
+; CHECK-NEXT: v_accvgpr_read_b32 v17, a15
+; CHECK-NEXT: v_accvgpr_read_b32 v16, a14
+; CHECK-NEXT: v_accvgpr_read_b32 v15, a13
+; CHECK-NEXT: v_accvgpr_read_b32 v14, a12
+; CHECK-NEXT: v_accvgpr_read_b32 v13, a11
+; CHECK-NEXT: v_accvgpr_read_b32 v12, a10
+; CHECK-NEXT: v_accvgpr_read_b32 v11, a9
+; CHECK-NEXT: v_accvgpr_read_b32 v10, a8
+; CHECK-NEXT: v_accvgpr_read_b32 v9, a7
+; CHECK-NEXT: v_accvgpr_read_b32 v8, a6
+; CHECK-NEXT: v_accvgpr_read_b32 v7, a5
+; CHECK-NEXT: v_accvgpr_read_b32 v6, a4
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a3
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a2
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v1, v34, v[2:33]
+; CHECK-NEXT: v_mov_b32_e32 v1, 0x41000000
+; CHECK-NEXT: v_mov_b32_e32 v34, 0x41800000
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v1, v34, v[2:33]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_nop 7
+; CHECK-NEXT: s_nop 7
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_store_dwordx4 v0, v[30:33], s[0:1] offset:112
+; CHECK-NEXT: global_store_dwordx4 v0, v[26:29], s[0:1] offset:96
+; CHECK-NEXT: global_store_dwordx4 v0, v[22:25], s[0:1] offset:80
+; CHECK-NEXT: global_store_dwordx4 v0, v[18:21], s[0:1] offset:64
+; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] offset:48
+; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32
+; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; CHECK-NEXT: s_endpgm
+ %src2 = call <32 x float> asm sideeffect "; def $0", "=a"()
+ %mai0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0)
+ %mai1 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 8.0, float 16.0, <32 x float> %mai0, i32 0, i32 0, i32 0)
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr <32 x float>, ptr addrspace(1) %arg0, i32 %id
+ store <32 x float> %mai1, ptr addrspace(1) %gep0, align 128
+ ret void
+}
+
+; Untied case
+define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64(double %arg0, double %arg1, ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_and_b32_e32 v8, 0x3ff, v31
+; CHECK-NEXT: v_accvgpr_read_b32 v7, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v6, a0
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[6:7]
+; CHECK-NEXT: v_lshlrev_b32_e32 v2, 3, v8
+; CHECK-NEXT: v_mov_b32_e32 v3, 0
+; CHECK-NEXT: v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3]
+; CHECK-NEXT: s_nop 5
+; CHECK-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %src2 = call double asm sideeffect "; def $0", "=a"()
+ %mai = call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %arg0, double %arg1, double %src2, i32 0, i32 0, i32 0)
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr double, ptr addrspace(1) %ptr, i32 %id
+ store double %mai, ptr addrspace(1) %gep0, align 8
+ ret void
+}
+
+define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %arg0, double %arg1, double %arg2, double %arg3, ptr addrspace(1) %ptr) #0 {
+; CHECK-LABEL: test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:1]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_accvgpr_read_b32 v11, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v10, a0
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[10:11]
+; CHECK-NEXT: v_and_b32_e32 v2, 0x3ff, v31
+; CHECK-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; CHECK-NEXT: v_mov_b32_e32 v3, 0
+; CHECK-NEXT: v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3]
+; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[4:5], v[6:7], v[0:1]
+; CHECK-NEXT: s_nop 7
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %src2 = call double asm sideeffect "; def $0", "=a"()
+ %mai0 = call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %arg0, double %arg1, double %src2, i32 0, i32 0, i32 0)
+ %mai1 = call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %arg2, double %arg3, double %mai0, i32 0, i32 0, i32 0)
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr double, ptr addrspace(1) %ptr, i32 %id
+ store double %mai1, ptr addrspace(1) %gep0, align 8
+ ret void
+}
+
+define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class_subreg(ptr addrspace(1) %arg) #0 {
+; CHECK-LABEL: test_rewrite_mfma_direct_copy_from_agpr_class_subreg:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v1, 2.0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_mov_b32_e32 v18, 4.0
+; CHECK-NEXT: v_accvgpr_read_b32 v17, a15
+; CHECK-NEXT: v_accvgpr_read_b32 v16, a14
+; CHECK-NEXT: v_accvgpr_read_b32 v15, a13
+; CHECK-NEXT: v_accvgpr_read_b32 v14, a12
+; CHECK-NEXT: v_accvgpr_read_b32 v13, a11
+; CHECK-NEXT: v_accvgpr_read_b32 v12, a10
+; CHECK-NEXT: v_accvgpr_read_b32 v11, a9
+; CHECK-NEXT: v_accvgpr_read_b32 v10, a8
+; CHECK-NEXT: v_accvgpr_read_b32 v9, a7
+; CHECK-NEXT: v_accvgpr_read_b32 v8, a6
+; CHECK-NEXT: v_accvgpr_read_b32 v7, a5
+; CHECK-NEXT: v_accvgpr_read_b32 v6, a4
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a3
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a2
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a1
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a0
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 v[2:17], v1, v18, v[2:17]
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 6, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_nop 7
+; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] offset:48
+; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32
+; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; CHECK-NEXT: s_endpgm
+ %def = call <32 x float> asm sideeffect "; def $0", "=a"()
+ %src2 = shufflevector <32 x float> %def, <32 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %mai = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 2.0, float 4.0, <16 x float> %src2, i32 0, i32 0, i32 0)
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
+ store <16 x float> %mai, ptr addrspace(1) %gep, align 64
+ ret void
+}
+
+define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class_subreg_odd(ptr addrspace(1) %arg) #0 {
+; CHECK-LABEL: test_rewrite_mfma_direct_copy_from_agpr_class_subreg_odd:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v1, 2.0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_mov_b32_e32 v18, 4.0
+; CHECK-NEXT: v_accvgpr_read_b32 v17, a16
+; CHECK-NEXT: v_accvgpr_read_b32 v16, a15
+; CHECK-NEXT: v_accvgpr_read_b32 v15, a14
+; CHECK-NEXT: v_accvgpr_read_b32 v14, a13
+; CHECK-NEXT: v_accvgpr_read_b32 v13, a12
+; CHECK-NEXT: v_accvgpr_read_b32 v12, a11
+; CHECK-NEXT: v_accvgpr_read_b32 v11, a10
+; CHECK-NEXT: v_accvgpr_read_b32 v10, a9
+; CHECK-NEXT: v_accvgpr_read_b32 v9, a8
+; CHECK-NEXT: v_accvgpr_read_b32 v8, a7
+; CHECK-NEXT: v_accvgpr_read_b32 v7, a6
+; CHECK-NEXT: v_accvgpr_read_b32 v6, a5
+; CHECK-NEXT: v_accvgpr_read_b32 v5, a4
+; CHECK-NEXT: v_accvgpr_read_b32 v4, a3
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a2
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a1
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-NEXT: v_mfma_f32_16x16x1_4b_f32 v[2:17], v1, v18, v[2:17]
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 6, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_nop 7
+; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] offset:48
+; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32
+; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; CHECK-NEXT: s_endpgm
+ %def = call <32 x float> asm sideeffect "; def $0", "=a"()
+ %src2 = shufflevector <32 x float> %def, <32 x float> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
+ %mai = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 2.0, float 4.0, <16 x float> %src2, i32 0, i32 0, i32 0)
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
+ store <16 x float> %mai, ptr addrspace(1) %gep, align 64
+ ret void
+}
+
+; a->v->mfma->a
+define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class_copy_back() #0 {
+; CHECK-LABEL: test_rewrite_mfma_direct_copy_from_agpr_class_copy_back:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v32, 2.0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_mov_b32_e32 v33, 4.0
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v33, a[0:31]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_endpgm
+ %src2 = call <32 x float> asm sideeffect "; def $0", "=a"()
+ %mai = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0)
+ call void asm sideeffect "; use $0", "a"(<32 x float> %mai)
ret void
}
-declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32 immarg, i32 immarg, i32 immarg) #1
-declare noundef i32 @llvm.amdgcn.workitem.id.x() #2
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) #2
+declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #2
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32 immarg, i32 immarg, i32 immarg) #2
+declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #3
-attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,4" }
-attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
-attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="4,4" }
+attributes #1 = { mustprogress nofree norecurse nounwind willreturn "amdgpu-waves-per-eu"="8,8" }
+attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
index 1f2b3e2..03a666f 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
@@ -1,31 +1,40 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=SOUT %s
-@bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+%class.ExpAmdWorkgroupWaveBarrier = type { target("amdgcn.named.barrier", 0) }
+
+@bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison
@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
-@bar1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+@bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison
-; CHECK: @bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !0
+; CHECK: @bar2 = internal addrspace(3) global [2 x target("amdgcn.named.barrier", 0)] poison, !absolute_symbol !0
; CHECK-NEXT: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !1
-; CHECK-NEXT: @bar1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !2
-; CHECK-NEXT: @bar1.kernel1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !2
+; CHECK-NEXT: @bar1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2
+; CHECK-NEXT: @bar1.kernel1 = internal addrspace(3) global [4 x %class.ExpAmdWorkgroupWaveBarrier] poison, !absolute_symbol !2
+; SOUT: .set func1.num_named_barrier, 7
define void @func1() {
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
+; SOUT: .set func2.num_named_barrier, 2
define void @func2() {
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
ret void
}
+; SOUT: .amdhsa_named_barrier_count 2
+; SOUT: .set kernel1.num_named_barrier, max(6, func1.num_named_barrier, func2.num_named_barrier)
define amdgpu_kernel void @kernel1() #0 {
; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11)
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
- call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) @bar1)
%state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
call void @llvm.amdgcn.s.barrier()
call void @func1()
@@ -33,9 +42,12 @@ define amdgpu_kernel void @kernel1() #0 {
ret void
}
+; SOUT: .amdhsa_named_barrier_count 2
+; SOUT: .set kernel2.num_named_barrier, max(6, func2.num_named_barrier)
define amdgpu_kernel void @kernel2() #0 {
; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
call void @llvm.amdgcn.s.barrier.wait(i16 1)
call void @func2()
@@ -47,6 +59,9 @@ declare void @llvm.amdgcn.s.barrier.wait(i16) #1
declare void @llvm.amdgcn.s.barrier.signal(i32) #1
declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.leave(i16) #1
declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
@@ -55,5 +70,5 @@ attributes #1 = { convergent nounwind }
attributes #2 = { nounwind readnone }
; CHECK: !0 = !{i32 8396816, i32 8396817}
-; CHECK-NEXT: !1 = !{i32 8396848, i32 8396849}
-; CHECK-NEXT: !2 = !{i32 8396832, i32 8396833}
+; CHECK-NEXT: !1 = !{i32 8396912, i32 8396913}
+; CHECK-NEXT: !2 = !{i32 8396848, i32 8396849}
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
new file mode 100644
index 0000000..a4fa8e4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
@@ -0,0 +1,271 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s
+
+@bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+@bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+
+define void @func1() {
+; GFX12-SDAG-LABEL: func1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 3
+; GFX12-SDAG-NEXT: s_barrier_join m0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70003
+; GFX12-SDAG-NEXT: s_barrier_signal m0
+; GFX12-SDAG-NEXT: s_barrier_wait 1
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: func1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70003
+; GFX12-GISEL-NEXT: s_barrier_join 3
+; GFX12-GISEL-NEXT: s_barrier_signal m0
+; GFX12-GISEL-NEXT: s_barrier_wait 1
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ ret void
+}
+
+define void @func2() {
+; GFX12-SDAG-LABEL: func2:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 1
+; GFX12-SDAG-NEXT: s_barrier_join m0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70001
+; GFX12-SDAG-NEXT: s_barrier_signal m0
+; GFX12-SDAG-NEXT: s_barrier_wait 1
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: func2:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70001
+; GFX12-GISEL-NEXT: s_barrier_join 1
+; GFX12-GISEL-NEXT: s_barrier_signal m0
+; GFX12-GISEL-NEXT: s_barrier_wait 1
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ ret void
+}
+
+define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+; GFX12-SDAG-LABEL: kernel1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX12-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX12-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 0xc0002
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v31, v0
+; GFX12-SDAG-NEXT: s_barrier_init m0
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[8:9], s[4:5], 48
+; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_lshr_b32 s2, s2, 4
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_and_b32 s2, s2, 63
+; GFX12-SDAG-NEXT: s_or_b32 s3, 0x90000, s2
+; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, s3
+; GFX12-SDAG-NEXT: s_barrier_init m0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 0xc0002
+; GFX12-SDAG-NEXT: s_barrier_signal m0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, s3
+; GFX12-SDAG-NEXT: s_barrier_signal m0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, s2
+; GFX12-SDAG-NEXT: s_barrier_signal -1
+; GFX12-SDAG-NEXT: s_barrier_join m0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 2
+; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
+; GFX12-SDAG-NEXT: s_barrier_wait 1
+; GFX12-SDAG-NEXT: s_barrier_leave
+; GFX12-SDAG-NEXT: s_get_barrier_state s3, m0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, s2
+; GFX12-SDAG-NEXT: s_get_barrier_state s2, m0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_getpc_b64 s[2:3]
+; GFX12-SDAG-NEXT: s_sext_i32_i16 s3, s3
+; GFX12-SDAG-NEXT: s_add_co_u32 s2, s2, func1@gotpcrel32@lo+8
+; GFX12-SDAG-NEXT: s_add_co_ci_u32 s3, s3, func1@gotpcrel32@hi+16
+; GFX12-SDAG-NEXT: s_barrier_signal -1
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-SDAG-NEXT: s_barrier_wait -1
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX12-SDAG-NEXT: s_getpc_b64 s[2:3]
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_sext_i32_i16 s3, s3
+; GFX12-SDAG-NEXT: s_add_co_u32 s2, s2, func2@gotpcrel32@lo+12
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_add_co_ci_u32 s3, s3, func2@gotpcrel32@hi+24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX12-SDAG-NEXT: s_get_barrier_state s0, -1
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: kernel1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_mov_b64 s[12:13], s[4:5]
+; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX12-GISEL-NEXT: s_load_b32 s0, s[12:13], 0x2c
+; GFX12-GISEL-NEXT: s_mov_b32 m0, 0xc0002
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v31, v0
+; GFX12-GISEL-NEXT: s_barrier_init m0
+; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX12-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_lshr_b32 s0, s0, 4
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_and_b32 s0, s0, 63
+; GFX12-GISEL-NEXT: s_or_b32 s1, s0, 0x90000
+; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
+; GFX12-GISEL-NEXT: s_mov_b32 m0, s1
+; GFX12-GISEL-NEXT: s_barrier_init m0
+; GFX12-GISEL-NEXT: s_mov_b32 m0, 0xc0002
+; GFX12-GISEL-NEXT: s_barrier_signal m0
+; GFX12-GISEL-NEXT: s_mov_b32 m0, s1
+; GFX12-GISEL-NEXT: s_barrier_signal m0
+; GFX12-GISEL-NEXT: s_mov_b32 m0, s0
+; GFX12-GISEL-NEXT: s_barrier_signal -1
+; GFX12-GISEL-NEXT: s_barrier_join m0
+; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
+; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48
+; GFX12-GISEL-NEXT: s_barrier_wait 1
+; GFX12-GISEL-NEXT: s_barrier_leave
+; GFX12-GISEL-NEXT: s_get_barrier_state s0, 2
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_get_barrier_state s0, m0
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s13, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_getpc_b64 s[0:1]
+; GFX12-GISEL-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, func1@gotpcrel32@lo+8
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, func1@gotpcrel32@hi+16
+; GFX12-GISEL-NEXT: s_barrier_signal -1
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-GISEL-NEXT: s_barrier_wait -1
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s13, 0
+; GFX12-GISEL-NEXT: s_getpc_b64 s[0:1]
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, func2@gotpcrel32@lo+12
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, func2@gotpcrel32@hi+24
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX12-GISEL-NEXT: s_get_barrier_state s0, -1
+; GFX12-GISEL-NEXT: s_endpgm
+ call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) @bar, i32 12)
+ call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %in, i32 9)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 12)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %in, i32 9)
+ call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %in)
+ %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ call void @llvm.amdgcn.s.barrier.leave(i16 1)
+ %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar)
+ %state2 = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %in)
+ call void @llvm.amdgcn.s.barrier()
+ call void @func1()
+ call void @func2()
+ %state3 = call i32 @llvm.amdgcn.s.get.barrier.state(i32 -1)
+ ret void
+}
+
+define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+; GFX12-SDAG-LABEL: kernel2:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX12-SDAG-NEXT: s_getpc_b64 s[6:7]
+; GFX12-SDAG-NEXT: s_sext_i32_i16 s7, s7
+; GFX12-SDAG-NEXT: s_add_co_u32 s6, s6, func2@gotpcrel32@lo+8
+; GFX12-SDAG-NEXT: s_add_co_ci_u32 s7, s7, func2@gotpcrel32@hi+16
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v31, v0
+; GFX12-SDAG-NEXT: s_load_b64 s[12:13], s[6:7], 0x0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70002
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[8:9], s[4:5], 48
+; GFX12-SDAG-NEXT: s_barrier_signal m0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 2
+; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX12-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
+; GFX12-SDAG-NEXT: s_barrier_join m0
+; GFX12-SDAG-NEXT: s_barrier_wait 1
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[12:13]
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: kernel2:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_add_co_u32 s8, s4, 48
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s5, 0
+; GFX12-GISEL-NEXT: s_getpc_b64 s[4:5]
+; GFX12-GISEL-NEXT: s_sext_i32_i16 s5, s5
+; GFX12-GISEL-NEXT: s_add_co_u32 s4, s4, func2@gotpcrel32@lo+8
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s5, s5, func2@gotpcrel32@hi+16
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v31, v0
+; GFX12-GISEL-NEXT: s_load_b64 s[12:13], s[4:5], 0x0
+; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70002
+; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX12-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
+; GFX12-GISEL-NEXT: s_barrier_signal m0
+; GFX12-GISEL-NEXT: s_barrier_join 2
+; GFX12-GISEL-NEXT: s_barrier_wait 1
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13]
+; GFX12-GISEL-NEXT: s_endpgm
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+
+ call void @func2()
+ ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+declare void @llvm.amdgcn.s.barrier.wait(i16) #1
+declare void @llvm.amdgcn.s.barrier.signal(i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.leave(i16) #1
+declare i32 @llvm.amdgcn.s.get.barrier.state(i32) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/s_add_co_pseudo_lowering.mir b/llvm/test/CodeGen/AMDGPU/s_add_co_pseudo_lowering.mir
index 15b2a77..40137ab 100644
--- a/llvm/test/CodeGen/AMDGPU/s_add_co_pseudo_lowering.mir
+++ b/llvm/test/CodeGen/AMDGPU/s_add_co_pseudo_lowering.mir
@@ -11,10 +11,8 @@ body: |
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr0, $sgpr1, $sgpr2
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr1
; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll
index 019eb2c..4995ce6 100644
--- a/llvm/test/CodeGen/AMDGPU/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll
@@ -124,9 +124,8 @@ define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
; GFX6-NEXT: v_add_i32_e64 v1, s[4:5], v0, v1
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v1
-; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, -v0, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i32:
@@ -136,9 +135,8 @@ define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v0, v1
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v1
-; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, -v0, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_i32:
@@ -383,16 +381,14 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v0, v2
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2
-; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -v0, s[4:5]
; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v1, v3
; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2
-; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -v1, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v2i32:
@@ -402,16 +398,14 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v0, v2
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2
-; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -v0, s[4:5]
; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v1, v3
; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2
-; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, -v1, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_v2i32:
@@ -442,8 +436,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v5
; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i64:
@@ -456,8 +449,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v5
; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_i64:
@@ -470,8 +462,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v5
; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_saddsat_i64:
@@ -480,12 +471,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3]
-; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6
+; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v5
; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_saddsat_i64:
@@ -494,11 +484,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3]
-; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v6
+; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v5
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
index 735720a..725d57d 100644
--- a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
@@ -285,7 +285,7 @@ define amdgpu_ps void @flat_store_b32_idxprom(ptr align 4 inreg %p, i32 %idx) {
; GCN-LABEL: flat_store_b32_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b32_e32 v1, 1.0
-; GCN-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset
+; GCN-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SE
; GCN-NEXT: s_endpgm
entry:
%idxprom = sext i32 %idx to i64
@@ -298,7 +298,7 @@ define amdgpu_ps void @flat_store_b16_idxprom(ptr align 2 inreg %p, i32 %idx) {
; GCN-LABEL: flat_store_b16_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b32_e32 v1, 1
-; GCN-NEXT: flat_store_b16 v0, v1, s[0:1] scale_offset
+; GCN-NEXT: flat_store_b16 v0, v1, s[0:1] scale_offset scope:SCOPE_SE
; GCN-NEXT: s_endpgm
entry:
%idxprom = sext i32 %idx to i64
@@ -311,7 +311,7 @@ define amdgpu_ps void @flat_store_b64_idxprom(ptr align 4 inreg %p, i32 %idx) {
; GCN-LABEL: flat_store_b64_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0
-; GCN-NEXT: flat_store_b64 v0, v[2:3], s[0:1] scale_offset
+; GCN-NEXT: flat_store_b64 v0, v[2:3], s[0:1] scale_offset scope:SCOPE_SE
; GCN-NEXT: s_endpgm
entry:
%idxprom = sext i32 %idx to i64
@@ -337,12 +337,15 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %
; SDAG-LABEL: flat_atomicrmw_b64_rtn_idxprom:
; SDAG: ; %bb.0: ; %entry
; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 3, s[0:1]
-; SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
-; SDAG-NEXT: s_mov_b32 s0, exec_lo
+; SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; SDAG-NEXT: v_xor_b32_e32 v0, s0, v3
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
-; SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo
; SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; SDAG-NEXT: s_cbranch_execnz .LBB21_3
; SDAG-NEXT: ; %bb.1: ; %Flow
@@ -360,13 +363,16 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %
; SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; SDAG-NEXT: s_cbranch_execz .LBB21_2
; SDAG-NEXT: .LBB21_4: ; %atomicrmw.private
+; SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc_lo
; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
; SDAG-NEXT: s_wait_loadcnt 0x0
; SDAG-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
-; SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
+; SDAG-NEXT: scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE
; SDAG-NEXT: s_wait_xcnt 0x0
; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; SDAG-NEXT: s_branch .LBB21_5
@@ -374,19 +380,21 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %
;
; GISEL-LABEL: flat_atomicrmw_b64_rtn_idxprom:
; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi
; GISEL-NEXT: v_mov_b32_e32 v2, v0
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GISEL-NEXT: s_mov_b64 s[2:3], src_private_base
-; GISEL-NEXT: s_mov_b32 s2, exec_lo
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[2:3]
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v0
; GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v1, vcc_lo
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-NEXT: v_xor_b32_e32 v0, s2, v5
+; GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-NEXT: v_cmpx_ne_u32_e64 s3, v5
+; GISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GISEL-NEXT: s_xor_b32 s2, exec_lo, s2
; GISEL-NEXT: s_cbranch_execnz .LBB21_3
; GISEL-NEXT: ; %bb.1: ; %Flow
@@ -398,19 +406,22 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %
; GISEL-NEXT: s_branch .LBB21_5
; GISEL-NEXT: .LBB21_3: ; %atomicrmw.global
; GISEL-NEXT: v_mov_b64_e32 v[0:1], 1
-; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT: ; implicit-def: $vgpr4
; GISEL-NEXT: flat_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GISEL-NEXT: s_wait_xcnt 0x0
; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2
; GISEL-NEXT: s_cbranch_execz .LBB21_2
; GISEL-NEXT: .LBB21_4: ; %atomicrmw.private
+; GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
; GISEL-NEXT: scratch_load_b64 v[0:1], v4, off
; GISEL-NEXT: s_wait_loadcnt 0x0
; GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1]
-; GISEL-NEXT: scratch_store_b64 v4, v[2:3], off
+; GISEL-NEXT: scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE
; GISEL-NEXT: s_wait_xcnt 0x0
; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GISEL-NEXT: s_branch .LBB21_5
diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll
index 27ecc83..e5db4c6 100644
--- a/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll
@@ -9,7 +9,7 @@ define amdgpu_ps float @scratch_load_b32_alloca_idxprom(i32 %idx) {
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
- %p = alloca [32 x i32], align 4, addrspace(5)
+ %p = alloca [64 x i32], align 4, addrspace(5)
%idxprom = zext i32 %idx to i64
%arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom
%ret = load float, ptr addrspace(5) %arrayidx, align 4
@@ -284,7 +284,7 @@ define amdgpu_ps void @scratch_store_b32_idxprom(ptr addrspace(5) align 4 inreg
; GCN-LABEL: scratch_store_b32_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b32_e32 v1, 1.0
-; GCN-NEXT: scratch_store_b32 v0, v1, s0 scale_offset
+; GCN-NEXT: scratch_store_b32 v0, v1, s0 scale_offset scope:SCOPE_SE
; GCN-NEXT: s_endpgm
entry:
%idxprom = zext i32 %idx to i64
@@ -297,7 +297,7 @@ define amdgpu_ps void @scratch_store_b16_idxprom(ptr addrspace(5) align 2 inreg
; GCN-LABEL: scratch_store_b16_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b32_e32 v1, 1
-; GCN-NEXT: scratch_store_b16 v0, v1, s0 scale_offset
+; GCN-NEXT: scratch_store_b16 v0, v1, s0 scale_offset scope:SCOPE_SE
; GCN-NEXT: s_endpgm
entry:
%idxprom = zext i32 %idx to i64
@@ -310,7 +310,7 @@ define amdgpu_ps void @scratch_store_b64_idxprom(ptr addrspace(5) align 4 inreg
; GCN-LABEL: scratch_store_b64_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0
-; GCN-NEXT: scratch_store_b64 v0, v[2:3], s0 scale_offset
+; GCN-NEXT: scratch_store_b64 v0, v[2:3], s0 scale_offset scope:SCOPE_SE
; GCN-NEXT: s_endpgm
entry:
%idxprom = zext i32 %idx to i64
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-count-graphics.ll b/llvm/test/CodeGen/AMDGPU/sgpr-count-graphics.ll
new file mode 100644
index 0000000..3c7b5bf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-count-graphics.ll
@@ -0,0 +1,38 @@
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=CHECK,PACKED16
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck %s --check-prefixes=CHECK,SPLIT16
+
+@global = addrspace(1) global i32 poison, align 4
+
+; The hardware initializes the registers received as arguments by entry points,
+; so they will be counted even if unused.
+
+; Vectors of i1 are always unpacked
+
+; CHECK-LABEL: vec_of_i1:
+; CHECK: TotalNumSgprs: 8
+define amdgpu_ps void @vec_of_i1(<8 x i1> inreg %v8i1) {
+ ret void
+}
+
+; Vectors of i8 are always unpacked
+
+; CHECK-LABEL: vec_of_i8:
+; CHECK: TotalNumSgprs: 4
+define amdgpu_ps void @vec_of_i8(<4 x i8> inreg %v4i8) {
+ ret void
+}
+
+; Vectors of 16-bit types are packed for newer architectures and unpacked for older ones.
+
+; CHECK-LABEL: vec_of_16_bit_ty:
+; PACKED16: TotalNumSgprs: 3
+; SPLIT16: TotalNumSgprs: 6
+define amdgpu_ps void @vec_of_16_bit_ty(<2 x i16> inreg %v2i16, <4 x half> inreg %v4half) {
+ ret void
+}
+
+; CHECK-LABEL: buffer_fat_ptr:
+; CHECK: TotalNumSgprs: 5
+define amdgpu_ps void @buffer_fat_ptr(ptr addrspace(7) inreg %p) {
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
index b52821e..702953c 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
@@ -1298,16 +1298,10 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in
; GCN-NEXT: v_writelane_b32 v5, s37, 3
; GCN-NEXT: v_mov_b32_e32 v4, v3
; GCN-NEXT: v_mov_b32_e32 v3, v1
-; GCN-NEXT: ; implicit-def: $sgpr4
-; GCN-NEXT: ; implicit-def: $sgpr4
; GCN-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: ; implicit-def: $sgpr4
-; GCN-NEXT: ; implicit-def: $sgpr4
; GCN-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GCN-NEXT: v_mov_b32_e32 v3, v4
-; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5
; GCN-NEXT: flat_load_dwordx4 v[6:9], v[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir
index 2daea2b..3b02a2e 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -run-pass si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
---
@@ -10,7 +11,6 @@ body: |
; GCN-NEXT: liveins: $vgpr0, $vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[V_CVT_U32_F32_e64:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, killed $vgpr0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[IMPLICIT_DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GCN-NEXT: [[V_CMP_GT_U32_e64:%[0-9]+]]:sreg_64_xexec = samesign V_CMP_GT_U32_e64 [[V_CVT_U32_F32_e64]], killed [[COPY1]], implicit $exec
; GCN-NEXT: [[VREG1:%[0-9]+]]:vreg_1 = COPY [[V_CMP_GT_U32_e64]]
@@ -29,3 +29,5 @@ body: |
bb.1:
S_ENDPGM 0
...
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+# GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/shl64_reduce_flags.ll b/llvm/test/CodeGen/AMDGPU/shl64_reduce_flags.ll
index 7c75303..c3711a5 100644
--- a/llvm/test/CodeGen/AMDGPU/shl64_reduce_flags.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl64_reduce_flags.ll
@@ -20,13 +20,11 @@ define i64 @shl_nsw(i64 %arg0, i64 %shift_amt) {
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF3]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, killed [[COPY3]], %subreg.sub1
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
@@ -49,13 +47,11 @@ define i64 @shl_nuw(i64 %arg0, i64 %shift_amt) {
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF3]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, killed [[COPY3]], %subreg.sub1
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
@@ -78,13 +74,11 @@ define i64 @shl_nsw_nuw(i64 %arg0, i64 %shift_amt) {
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF3]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, killed [[COPY3]], %subreg.sub1
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
index e83ed89..91b0247 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
@@ -19,7 +19,7 @@ define void @shl_base_atomicrmw_global_ptr(ptr addrspace(1) %out, ptr addrspace(
%cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64
%shl = shl i64 %cast, 2
%castback = inttoptr i64 %shl to ptr addrspace(1)
- %val = atomicrmw and ptr addrspace(1) %castback, i32 3 syncscope("agent") seq_cst
+ %val = atomicrmw and ptr addrspace(1) %castback, i32 3 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
store volatile i64 %cast, ptr addrspace(1) %extra.use, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 21aa40d..91c88ec 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -1528,10 +1528,9 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 64
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-SDAG-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-SDAG-TRUE16-NEXT: s_endpgm
;
@@ -1560,10 +1559,9 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, 0xffc0, v0.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-GISEL-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir
index 400005a..98ecb4a 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir
@@ -24,7 +24,6 @@ body: |
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_1]]
; CHECK-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 killed [[COPY1]], [[COPY2]], implicit $exec
; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[S_MOV_B32_2]], [[V_XOR_B32_e64_]], implicit $exec
; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_AND_B32_e64_]], implicit $exec
; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
diff --git a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir
index 767942b..9b9601e 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir
@@ -42,7 +42,6 @@ body: |
# GCN-LABEL: name: dead_illegal_virtreg_copy
# GCN: %0:vgpr_32 = COPY $vgpr0
-# GCN: %1:sreg_32_xm0 = IMPLICIT_DEF
# GCN: S_ENDPGM 0, implicit %0
name: dead_illegal_virtreg_copy
@@ -60,7 +59,6 @@ body: |
# GCN-LABEL: name: dead_illegal_physreg_copy
# GCN: %2:vgpr_32 = COPY $vgpr0
-# GCN: %1:sreg_32_xm0 = IMPLICIT_DEF
# GCN: S_ENDPGM 0, implicit %2
name: dead_illegal_physreg_copy
@@ -135,3 +133,5 @@ body: |
SI_RETURN
...
+## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+# GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir
index a0ea04b1..8326862 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-agprs.mir
@@ -31,9 +31,8 @@ body: |
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
- ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_64 = IMPLICIT_DEF
- ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY killed [[DEF]]
- ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_64_align2 = IMPLICIT_DEF
+ ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]], 0, 0, implicit $exec
%0:vreg_64_align2 = COPY $vgpr0_vgpr1
%1:areg_64 = IMPLICIT_DEF
%2:areg_64_align2 = COPY killed %1
@@ -105,9 +104,8 @@ body: |
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
- ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_96 = IMPLICIT_DEF
- ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY killed [[DEF]]
- ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_96_align2 = IMPLICIT_DEF
+ ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[DEF]], 0, 0, implicit $exec
%0:vreg_64_align2 = COPY $vgpr0_vgpr1
%1:areg_96 = IMPLICIT_DEF
%2:areg_96_align2 = COPY killed %1
@@ -234,9 +232,8 @@ body: |
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
- ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_128 = IMPLICIT_DEF
- ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY killed [[DEF]]
- ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:areg_128_align2 = IMPLICIT_DEF
+ ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[DEF]], 0, 0, implicit $exec
%0:vreg_64_align2 = COPY $vgpr0_vgpr1
%1:areg_128 = IMPLICIT_DEF
%2:areg_128_align2 = COPY killed %1
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
index a54c0ac..5f9b71c 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir
@@ -46,9 +46,8 @@ body: |
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
- ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
- ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[DEF]]
- ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; GFX90A-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]], 0, 0, implicit $exec
%0:vreg_64_align2 = COPY $vgpr0_vgpr1
%1:vreg_64 = IMPLICIT_DEF
%2:vreg_64_align2 = COPY killed %1
@@ -148,9 +147,8 @@ body: |
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
- ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_96 = IMPLICIT_DEF
- ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vreg_96_align2 = COPY killed [[DEF]]
- ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_96_align2 = IMPLICIT_DEF
+ ; GFX90A-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[DEF]], 0, 0, implicit $exec
%0:vreg_64_align2 = COPY $vgpr0_vgpr1
%1:vreg_96 = IMPLICIT_DEF
%2:vreg_96_align2 = COPY killed %1
@@ -326,11 +324,59 @@ body: |
; GFX90A: liveins: $vgpr0_vgpr1
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
- ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
- ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY killed [[DEF]]
- ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+ ; GFX90A-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[DEF]], 0, 0, implicit $exec
%0:vreg_64_align2 = COPY $vgpr0_vgpr1
%1:vreg_128 = IMPLICIT_DEF
%2:vreg_128_align2 = COPY killed %1
GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, implicit $exec
...
+
+# Make sure the alignment requirement is respected for VS_64 operand
+# uses.
+---
+name: aligned_vgpr_vs_64_constraint
+tracksRegLiveness: true
+isSSA: true
+body: |
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr8_sgpr9
+
+ ; GFX908-LABEL: name: aligned_vgpr_vs_64_constraint
+ ; GFX908: liveins: $vgpr0, $sgpr8_sgpr9
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+ ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX908-NEXT: [[GLOBAL_LOAD_DWORDX3_SADDR:%[0-9]+]]:vreg_96_align2 = GLOBAL_LOAD_DWORDX3_SADDR [[COPY]], [[COPY1]], 16, 0, implicit $exec :: (load (s96), align 4, addrspace 1)
+ ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX3_SADDR]].sub0
+ ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; GFX908-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, killed [[REG_SEQUENCE]], 0, [[GLOBAL_LOAD_DWORDX3_SADDR]].sub1_sub2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX908-NEXT: DS_WRITE_B64_gfx9 [[V_MOV_B32_e32_]], killed [[V_PK_ADD_F32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3)
+ ; GFX908-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90A-LABEL: name: aligned_vgpr_vs_64_constraint
+ ; GFX90A: liveins: $vgpr0, $sgpr8_sgpr9
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[GLOBAL_LOAD_DWORDX3_SADDR:%[0-9]+]]:vreg_96_align2 = GLOBAL_LOAD_DWORDX3_SADDR [[COPY]], [[COPY1]], 16, 0, implicit $exec :: (load (s96), align 4, addrspace 1)
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX3_SADDR]].sub0
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY killed [[GLOBAL_LOAD_DWORDX3_SADDR]].sub1_sub2
+ ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+ ; GFX90A-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, killed [[REG_SEQUENCE]], 0, killed [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: DS_WRITE_B64_gfx9 [[V_MOV_B32_e32_]], killed [[V_PK_ADD_F32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ %0:sgpr_64 = COPY $sgpr8_sgpr9
+ %1:vgpr_32 = COPY $vgpr0
+ %2:vreg_96_align2 = GLOBAL_LOAD_DWORDX3_SADDR %0, %1, 16, 0, implicit $exec :: (load (s96), align 4, addrspace 1)
+ %3:vgpr_32 = COPY %2.sub0
+ %4:vreg_64_align2 = COPY killed %2.sub1_sub2
+ %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %6:vreg_64_align2 = REG_SEQUENCE %3, %subreg.sub0, %5, %subreg.sub1
+ %7:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, killed %6, 0, killed %4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ DS_WRITE_B64_gfx9 %5, killed %7, 0, 0, implicit $exec :: (store (s64), addrspace 3)
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/si-pre-allocate-wwwmregs-dbg-noreg.mir b/llvm/test/CodeGen/AMDGPU/si-pre-allocate-wwwmregs-dbg-noreg.mir
new file mode 100644
index 0000000..cb515f8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-pre-allocate-wwwmregs-dbg-noreg.mir
@@ -0,0 +1,71 @@
+# RUN: llc %s -o - -mcpu=gfx1030 -O0 -run-pass=si-pre-allocate-wwm-regs | FileCheck %s
+
+# Simple regression test to make sure DBG_VALUE $noreg does not assert in the pass
+
+# CHECK: $vgpr0 = IMPLICIT_DEF
+# CHECK: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr3, 0, $vgpr0
+
+--- |
+ target triple = "amdgcn-amd-amdpal"
+ %dx.types.ResRet.f32 = type { float, float, float, float, i32 }
+
+ define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg noundef %globalTable, i32 inreg noundef %userdata4, <3 x i32> inreg noundef %WorkgroupId, i32 inreg noundef %MultiDispatchInfo, <3 x i32> noundef %LocalInvocationId) #0 !dbg !5 {
+ #dbg_value(i32 poison, !19, !DIExpression(DW_OP_LLVM_fragment, 0, 32), !20)
+ #dbg_value(%dx.types.ResRet.f32 poison, !21, !DIExpression(), !23)
+ ret void, !dbg !24
+ }
+
+ attributes #0 = { memory(readwrite) "amdgpu-prealloc-sgpr-spill-vgprs" }
+
+ !llvm.dbg.cu = !{!0}
+ !llvm.module.flags = !{!3, !4}
+
+ !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "dxcoob 1.7.2308.16 (52da17e29)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !2)
+ !1 = !DIFile(filename: "tests\\basic_var.hlsl", directory: "")
+ !2 = !{}
+ !3 = !{i32 2, !"Dwarf Version", i32 5}
+ !4 = !{i32 2, !"Debug Info Version", i32 3}
+ !5 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 7, type: !6, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0)
+ !6 = !DISubroutineType(types: !7)
+ !7 = !{null, !8}
+ !8 = !DIDerivedType(tag: DW_TAG_typedef, name: "uint3", file: !1, baseType: !9)
+ !9 = !DICompositeType(tag: DW_TAG_class_type, name: "vector<unsigned int, 3>", file: !1, size: 96, align: 32, elements: !10, templateParams: !15)
+ !10 = !{!11, !13, !14}
+ !11 = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: !9, file: !1, baseType: !12, size: 32, align: 32, flags: DIFlagPublic)
+ !12 = !DIBasicType(name: "unsigned int", size: 32, align: 32, encoding: DW_ATE_unsigned)
+ !13 = !DIDerivedType(tag: DW_TAG_member, name: "y", scope: !9, file: !1, baseType: !12, size: 32, align: 32, offset: 32, flags: DIFlagPublic)
+ !14 = !DIDerivedType(tag: DW_TAG_member, name: "z", scope: !9, file: !1, baseType: !12, size: 32, align: 32, offset: 64, flags: DIFlagPublic)
+ !15 = !{!16, !17}
+ !16 = !DITemplateTypeParameter(name: "element", type: !12)
+ !17 = !DITemplateValueParameter(name: "element_count", type: !18, value: i32 3)
+ !18 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+ !19 = !DILocalVariable(name: "dtid", arg: 1, scope: !5, file: !1, line: 7, type: !8)
+ !20 = !DILocation(line: 7, column: 17, scope: !5)
+ !21 = !DILocalVariable(name: "my_var", scope: !5, file: !1, line: 11, type: !22)
+ !22 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float)
+ !23 = !DILocation(line: 11, column: 9, scope: !5)
+ !24 = !DILocation(line: 19, column: 1, scope: !5)
+...
+---
+name: _amdgpu_cs_main
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ hasSpilledSGPRs: true
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 16
+ sgprForEXECCopy: '$sgpr12_sgpr13'
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
+
+ %0:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %2:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr3, 0, %2
+ renamable $sgpr3 = COPY killed $sgpr2
+ renamable $sgpr4 = S_MOV_B32 6
+ %3:vgpr_32 = V_LSHL_ADD_U32_e64 killed $sgpr3, killed $sgpr4, %0, implicit $exec
+ DBG_VALUE %3, $noreg, !19, !DIExpression(DW_OP_LLVM_fragment, 0, 32), debug-location !20
+ DBG_VALUE $noreg, $noreg, !21, !DIExpression(), debug-location !23
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
index fa482d9..2895031 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
@@ -101,14 +101,14 @@ entry:
}
;.
-; NO: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; NO: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; NO: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; NO: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;.
-; OW: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; OW: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; OW: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; OW: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
;.
-; CW: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CW: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;.
; NO: [[META0]] = !{ptr @bar1, ptr @bar2}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index 65de7f8..3290bdb 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -58,8 +58,8 @@ define amdgpu_kernel void @test_simple_indirect_call() {
;.
-; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
;.
; ATTRIBUTOR_GCN: [[META0]] = !{i32 1, i32 5, i32 6, i32 10}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index 6a45b96..101787a 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -32,6 +33,16 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in)
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: sint_to_fp_i32_to_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f64_i32_e32 v[0:1], s2
+; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: s_endpgm
%result = sitofp i32 %in to double
store double %result, ptr addrspace(1) %out
ret void
@@ -73,6 +84,18 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) {
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: sint_to_fp_i1_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_cmp_eq_u32 s2, 0
+; GFX942-NEXT: s_cselect_b32 s2, 0xbff00000, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, s2
+; GFX942-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT: s_endpgm
%cmp = icmp eq i32 %in, 0
%fp = sitofp i1 %cmp to double
store double %fp, ptr addrspace(1) %out, align 4
@@ -113,6 +136,19 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in)
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: sint_to_fp_i1_f64_load:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_bitcmp1_b32 s2, 0
+; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3]
+; GFX942-NEXT: v_cvt_f64_i32_e32 v[0:1], v0
+; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: s_endpgm
%fp = sitofp i1 %in to double
store double %fp, ptr addrspace(1) %out, align 8
ret void
@@ -150,6 +186,18 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: s_sint_to_fp_i64_to_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f64_i32_e32 v[0:1], s3
+; GFX942-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
+; GFX942-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX942-NEXT: s_endpgm
%result = sitofp i64 %in to double
store double %result, ptr addrspace(1) %out
ret void
@@ -199,6 +247,22 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: v_sint_to_fp_i64_to_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_cvt_f64_i32_e32 v[2:3], v1
+; GFX942-NEXT: v_ldexp_f64 v[2:3], v[2:3], 32
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
+; GFX942-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX942-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%val = load i64, ptr addrspace(1) %gep, align 8
@@ -238,6 +302,17 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: s_sint_to_fp_i8_to_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_sext_i32_i8 s2, s2
+; GFX942-NEXT: v_cvt_f64_i32_e32 v[0:1], s2
+; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: s_endpgm
%fp = sitofp i8 %in to double
store double %fp, ptr addrspace(1) %out
ret void
@@ -258,6 +333,14 @@ define double @v_sint_to_fp_i8_to_f64(i8 %in) {
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], v0
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_sint_to_fp_i8_to_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX942-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX942-NEXT: v_cvt_f64_i32_e32 v[0:1], v0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%fp = sitofp i8 %in to double
ret double %fp
}
@@ -296,6 +379,18 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: s_select_sint_to_fp_i1_vals_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_cmp_eq_u32 s2, 0
+; GFX942-NEXT: s_cselect_b32 s2, 0xbff00000, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, s2
+; GFX942-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT: s_endpgm
%cmp = icmp eq i32 %in, 0
%select = select i1 %cmp, double -1.0, double 0.0
store double %select, ptr addrspace(1) %out, align 8
@@ -313,6 +408,18 @@ define void @v_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_select_sint_to_fp_i1_vals_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0xbff00000
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc
+; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %in, 0
%select = select i1 %cmp, double -1.0, double 0.0
store double %select, ptr addrspace(1) %out, align 8
@@ -353,6 +460,18 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: s_select_sint_to_fp_i1_vals_i64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_cmp_eq_u32 s2, 0
+; GFX942-NEXT: s_cselect_b32 s2, 0xbff00000, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, s2
+; GFX942-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT: s_endpgm
%cmp = icmp eq i32 %in, 0
%select = select i1 %cmp, i64 u0xbff0000000000000, i64 0
store i64 %select, ptr addrspace(1) %out, align 8
@@ -370,6 +489,18 @@ define void @v_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_select_sint_to_fp_i1_vals_i64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0xbff00000
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc
+; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %in, 0
%select = select i1 %cmp, i64 u0xbff0000000000000, i64 0
store i64 %select, ptr addrspace(1) %out, align 8
@@ -388,6 +519,18 @@ define void @v_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_swap_select_sint_to_fp_i1_vals_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0xbff00000
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e64 v5, v3, 0, vcc
+; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %in, 0
%select = select i1 %cmp, double 0.0, double -1.0
store double %select, ptr addrspace(1) %out, align 8
@@ -429,6 +572,18 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1)
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: s_swap_select_sint_to_fp_i1_vals_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_cmp_eq_u32 s2, 0
+; GFX942-NEXT: s_cselect_b32 s2, 0, 0xbff00000
+; GFX942-NEXT: v_mov_b32_e32 v1, s2
+; GFX942-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT: s_endpgm
%cmp = icmp eq i32 %in, 0
%select = select i1 %cmp, double 0.0, double -1.0
store double %select, ptr addrspace(1) %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
index 04f73a3..586579f 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
@@ -56,7 +56,6 @@ define void @test() {
; CHECK-NEXT: v_readlane_b32 s5, v1, 1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; CHECK-NEXT: s_mov_b32 s4, 1
-; CHECK-NEXT: ; implicit-def: $sgpr5
; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s4
; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5]
; CHECK-NEXT: s_cbranch_vccnz .LBB0_1
diff --git a/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll b/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll
index fee9f8e..18a991c 100644
--- a/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll
+++ b/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll
@@ -5,6 +5,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx904 -mattr=+sramecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -mattr=+sramecc < %s | FileCheck -check-prefixes=GCN,ECC %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -mattr=-sramecc < %s | FileCheck -check-prefixes=GCN,NO-ECC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,ECC %s
; Make sure the correct set of targets are marked with
; FeatureDoesNotSupportSRAMECC, and +sramecc is ignored if it's never
@@ -12,7 +13,7 @@
; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg:
; NO-ECC: global_load_short_d16_hi
-; ECC: global_load_ushort
+; ECC: global_load_{{ushort|u16}}
define void @load_global_hi_v2i16_reglo_vreg(ptr addrspace(1) %in, i16 %reg) {
entry:
%gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 -2047
diff --git a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll
index a3fed31..359c323 100644
--- a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll
@@ -1,6 +1,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s
; REQUIRES: asserts
diff --git a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll
index bd665eb..ea6e456 100644
--- a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll
+++ b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll
@@ -1,6 +1,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s
; REQUIRES: asserts
diff --git a/llvm/test/CodeGen/AMDGPU/srl64_reduce_flags.ll b/llvm/test/CodeGen/AMDGPU/srl64_reduce_flags.ll
index ca4b728..b3084d7 100644
--- a/llvm/test/CodeGen/AMDGPU/srl64_reduce_flags.ll
+++ b/llvm/test/CodeGen/AMDGPU/srl64_reduce_flags.ll
@@ -20,14 +20,12 @@ define i64 @srl_exact(i64 %arg0, i64 %shift_amt) {
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF3]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0
; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = exact V_LSHRREV_B32_e64 killed [[COPY5]], killed [[COPY3]], implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
index 40d80f5..09c0e77 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -124,9 +124,8 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
; GFX6-NEXT: v_sub_i32_e64 v1, s[4:5], v0, v1
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v1
-; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, -v0, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_i32:
@@ -136,9 +135,8 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v0, v1
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v1
-; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, -v0, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubsat_i32:
@@ -383,16 +381,14 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v0, v2
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2
-; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -v0, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v3
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2
-; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -v1, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v2i32:
@@ -402,16 +398,14 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v0, v2
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2
-; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -v0, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v1, v3
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2
-; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, -v1, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubsat_v2i32:
@@ -439,23 +433,20 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v3
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v3, -v0, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v4
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v3
-; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, -v1, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v5
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v3
-; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v2, v3, -v2, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v3i32:
@@ -465,23 +456,20 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v0, v3
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, -v0, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v1, v4
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v3
-; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, -v1, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v2, v5
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v3
-; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, -v2, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubsat_v3i32:
@@ -511,30 +499,26 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v0, v4
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v4
-; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, -v0, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v5
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v4
-; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, -v1, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v6
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v4
-; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, -v2, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v7
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3
; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v4
-; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v3, v4, -v3, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v4i32:
@@ -544,30 +528,26 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v0, v4
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v4
-; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, -v0, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v1, v5
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v4
-; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, -v1, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v2, v6
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v4
-; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, -v2, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v7
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v4
-; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v4, -v3, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubsat_v4i32:
@@ -599,58 +579,50 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v8
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v8
-; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v8, -v0, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v1, v9
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v8
-; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v8, -v1, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v10
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v8
-; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v2, v8, -v2, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v3, v11
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3
; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v8
-; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v3, v8, -v3, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v4, v12
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4
; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v8
-; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, -v4, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v13
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5
; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v8
-; GFX6-NEXT: v_xor_b32_e32 v5, 0x80000000, v5
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v5, v8, -v5, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v6, v14
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6
; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v8
-; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v6
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, -v6, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v7, v15
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7
; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v8
-; GFX6-NEXT: v_xor_b32_e32 v7, 0x80000000, v7
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v7, v8, -v7, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v8i32:
@@ -660,58 +632,50 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v0, v8
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v8
-; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, -v0, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v1, v9
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v8
-; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v8, -v1, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v2, v10
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v8
-; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, -v2, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v3, v11
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v8
-; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, -v3, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v4, v12
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4
; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v8
-; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, -v4, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v5, v13
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5
; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v8
-; GFX8-NEXT: v_xor_b32_e32 v5, 0x80000000, v5
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, -v5, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v6, v14
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6
; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v8
-; GFX8-NEXT: v_xor_b32_e32 v6, 0x80000000, v6
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, -v6, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v7, v15
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7
; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v8
-; GFX8-NEXT: v_xor_b32_e32 v7, 0x80000000, v7
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, -v7, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubsat_v8i32:
@@ -751,116 +715,100 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v0, v16
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v16
-; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v16, -v0, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v1, v17
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v16
-; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v16, -v1, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v2, v18
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v16
-; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v2, v16, -v2, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v3, v19
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3
; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v16
-; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc
-; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v4, v20
-; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20
-; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4
-; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v16
-; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v3, v16, -v3, s[4:5]
; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32
+; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v4, v20
+; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20
+; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v4
+; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v17
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v4, v17, -v4, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v5, v21
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5
; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v17
-; GFX6-NEXT: v_xor_b32_e32 v5, 0x80000000, v5
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v5, v17, -v5, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v6, v22
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6
; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v17
-; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v6
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v6, v17, -v6, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v7, v23
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7
; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v17
-; GFX6-NEXT: v_xor_b32_e32 v7, 0x80000000, v7
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v7, v17, -v7, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v8, v24
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8
; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v17
-; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v8, v17, -v8, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v9, v25
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9
; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v17
-; GFX6-NEXT: v_xor_b32_e32 v9, 0x80000000, v9
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v9, v17, -v9, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v10, v26
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10
; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v17
-; GFX6-NEXT: v_xor_b32_e32 v10, 0x80000000, v10
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v10, v17, -v10, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v11, v27
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11
; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v17
-; GFX6-NEXT: v_xor_b32_e32 v11, 0x80000000, v11
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v11, v17, -v11, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v12, v28
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v12
; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v17
-; GFX6-NEXT: v_xor_b32_e32 v12, 0x80000000, v12
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v12, v17, -v12, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v13, v29
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v13
; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v17
-; GFX6-NEXT: v_xor_b32_e32 v13, 0x80000000, v13
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v13, v17, -v13, s[4:5]
; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v14, v30
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v14
; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v17
-; GFX6-NEXT: v_xor_b32_e32 v14, 0x80000000, v14
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v14, v17, -v14, s[4:5]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16
; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v15, v16
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15
; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v16
-; GFX6-NEXT: v_xor_b32_e32 v15, 0x80000000, v15
-; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX6-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v15, v16, -v15, s[4:5]
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v16i32:
@@ -870,116 +818,100 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v0, v16
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, -v0, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v1, v17
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v16, -v1, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v2, v18
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v16, -v2, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v3, v19
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc
-; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v4, v20
-; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20
-; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4
-; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v16, -v3, s[4:5]
; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32
+; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v4, v20
+; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20
+; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v4
+; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v17
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v17, -v4, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v5, v21
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5
; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v17
-; GFX8-NEXT: v_xor_b32_e32 v5, 0x80000000, v5
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v17, -v5, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v6, v22
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6
; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v17
-; GFX8-NEXT: v_xor_b32_e32 v6, 0x80000000, v6
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v17, -v6, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v7, v23
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7
; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v17
-; GFX8-NEXT: v_xor_b32_e32 v7, 0x80000000, v7
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v17, -v7, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v8, v24
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8
; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v17
-; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v17, -v8, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v9, v25
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9
; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v17
-; GFX8-NEXT: v_xor_b32_e32 v9, 0x80000000, v9
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, -v9, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v10, v26
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10
; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v17
-; GFX8-NEXT: v_xor_b32_e32 v10, 0x80000000, v10
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v10, v17, -v10, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v11, v27
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11
; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v17
-; GFX8-NEXT: v_xor_b32_e32 v11, 0x80000000, v11
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v11, v17, -v11, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v12, v28
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v12
; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v17
-; GFX8-NEXT: v_xor_b32_e32 v12, 0x80000000, v12
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v12, v17, -v12, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v13, v29
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v13
; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v17
-; GFX8-NEXT: v_xor_b32_e32 v13, 0x80000000, v13
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, -v13, s[4:5]
; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v14, v30
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v14
; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v17
-; GFX8-NEXT: v_xor_b32_e32 v14, 0x80000000, v14
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v14, v17, -v14, s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16
; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v15, v16
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15
; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v16
-; GFX8-NEXT: v_xor_b32_e32 v15, 0x80000000, v15
-; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX8-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v15, v16, -v15, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubsat_v16i32:
@@ -1066,8 +998,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v5
; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_i64:
@@ -1080,8 +1011,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v5
; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubsat_i64:
@@ -1094,8 +1024,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v5
; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_ssubsat_i64:
@@ -1104,12 +1033,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3]
-; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6
+; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v5
; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_ssubsat_i64:
@@ -1118,11 +1046,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[2:3]
-; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v6
+; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v5
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, -v1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/store-to-constant-error.ll b/llvm/test/CodeGen/AMDGPU/store-to-constant-error.ll
deleted file mode 100644
index 0bfc45c..0000000
--- a/llvm/test/CodeGen/AMDGPU/store-to-constant-error.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o /dev/null %s 2>&1 | FileCheck -check-prefix=SDAG %s
-; RUN: not llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o /dev/null %s 2>&1 | FileCheck -check-prefix=GISEL %s
-
-; SDAG: LLVM ERROR: Cannot select: {{[a-z0-9]+}}: ch = store<(store (s32) into %ir.ptr.load, addrspace 4)>
-; GISEL: LLVM ERROR: cannot select: G_STORE %{{[0-9]+}}:vgpr(s32), %{{[0-9]+}}:vgpr(p4) :: (store (s32) into %ir.ptr.load, addrspace 4) (in function: store_to_constant_i32)
-define amdgpu_kernel void @store_to_constant_i32(ptr addrspace(4) %ptr) {
-bb:
- store i32 1, ptr addrspace(4) %ptr, align 4
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/store-to-constant.ll b/llvm/test/CodeGen/AMDGPU/store-to-constant.ll
new file mode 100644
index 0000000..9b3b520
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/store-to-constant.ll
@@ -0,0 +1,186 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s
+
+; FIXME: We need to test AS6, but the AS6 variants of the following tests fail because of an illegal VGPR-to-SGPR copy.
+; FIXME: We also want to test memset, memcpy, and memmove, but that first requires fixing the SelectionDAG store merging issue (#90714).
+
+define amdgpu_kernel void @store_as4_i8(ptr addrspace(4) %p, i8 %v) {
+; CHECK-LABEL: store_as4_i8:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s2
+; CHECK-NEXT: global_store_byte v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ store i8 %v, ptr addrspace(4) %p
+ ret void
+}
+
+define amdgpu_kernel void @store_as4_i16(ptr addrspace(4) %p, i16 %v) {
+; CHECK-LABEL: store_as4_i16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s2
+; CHECK-NEXT: global_store_short v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ store i16 %v, ptr addrspace(4) %p
+ ret void
+}
+
+define amdgpu_kernel void @store_as4_i32(ptr addrspace(4) %p, i32 %v) {
+; CHECK-LABEL: store_as4_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s2
+; CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ store i32 %v, ptr addrspace(4) %p
+ ret void
+}
+
+define amdgpu_kernel void @store_as4_i64(ptr addrspace(4) %p, i64 %v) {
+; CHECK-LABEL: store_as4_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
+; CHECK-NEXT: s_endpgm
+ store i64 %v, ptr addrspace(4) %p
+ ret void
+}
+
+define amdgpu_kernel void @store_as4_float(ptr addrspace(4) %p, float %v) {
+; CHECK-LABEL: store_as4_float:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s2
+; CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ store float %v, ptr addrspace(4) %p
+ ret void
+}
+
+define amdgpu_kernel void @store_as4_double(ptr addrspace(4) %p, double %v) {
+; CHECK-LABEL: store_as4_double:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
+; CHECK-NEXT: s_endpgm
+ store double %v, ptr addrspace(4) %p
+ ret void
+}
+
+define amdgpu_kernel void @store_as4_half(ptr addrspace(4) %p, half %v) {
+; CHECK-LABEL: store_as4_half:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s2
+; CHECK-NEXT: global_store_short v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ store half %v, ptr addrspace(4) %p
+ ret void
+}
+
+define amdgpu_kernel void @store_as4_2xi8(ptr addrspace(4) %p, <2 x i8> %v) {
+; CHECK-LABEL: store_as4_2xi8:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s2
+; CHECK-NEXT: global_store_short v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ store <2 x i8> %v, ptr addrspace(4) %p
+ ret void
+}
+
+define amdgpu_kernel void @store_as4_2xi16(ptr addrspace(4) %p, <2 x i16> %v) {
+; CHECK-LABEL: store_as4_2xi16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s2
+; CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ store <2 x i16> %v, ptr addrspace(4) %p
+ ret void
+}
+
+define amdgpu_kernel void @store_as4_2xi32(ptr addrspace(4) %p, <2 x i32> %v) {
+; CHECK-LABEL: store_as4_2xi32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; CHECK-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
+; CHECK-NEXT: s_endpgm
+ store <2 x i32> %v, ptr addrspace(4) %p
+ ret void
+}
+
+define amdgpu_kernel void @store_as4_2xhalf(ptr addrspace(4) %p, <2 x half> %v) {
+; CHECK-LABEL: store_as4_2xhalf:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s2
+; CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; CHECK-NEXT: s_endpgm
+ store <2 x half> %v, ptr addrspace(4) %p
+ ret void
+}
+
+define amdgpu_kernel void @store_as4_2xfloat(ptr addrspace(4) %p, <2 x float> %v) {
+; CHECK-LABEL: store_as4_2xfloat:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; CHECK-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
+; CHECK-NEXT: s_endpgm
+ store <2 x float> %v, ptr addrspace(4) %p
+ ret void
+}
+
+define amdgpu_kernel void @store_as4_2xdouble(ptr addrspace(4) %p, <2 x double> %v) {
+; CHECK-LABEL: store_as4_2xdouble:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
+; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
+; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[4:5]
+; CHECK-NEXT: s_endpgm
+ store <2 x double> %v, ptr addrspace(4) %p
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll
index 42436a1..b403651 100644
--- a/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll
+++ b/llvm/test/CodeGen/AMDGPU/structurize-hoist.ll
@@ -178,3 +178,94 @@ latch:
end:
ret void
}
+
+define void @test_nested_if(ptr %ptr, i32 %val, i1 %cond) {
+; GFX900-LABEL: test_nested_if:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: flat_load_dword v4, v[0:1]
+; GFX900-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v3
+; GFX900-NEXT: s_mov_b64 s[10:11], -1
+; GFX900-NEXT: s_xor_b64 s[4:5], s[6:7], -1
+; GFX900-NEXT: s_mov_b64 s[12:13], s[6:7]
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v3, v4
+; GFX900-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GFX900-NEXT: s_cbranch_execz .LBB3_4
+; GFX900-NEXT: ; %bb.1: ; %if
+; GFX900-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
+; GFX900-NEXT: s_cbranch_execz .LBB3_3
+; GFX900-NEXT: ; %bb.2: ; %if_2
+; GFX900-NEXT: flat_load_dword v3, v[0:1]
+; GFX900-NEXT: s_xor_b64 s[10:11], exec, -1
+; GFX900-NEXT: .LBB3_3: ; %Flow3
+; GFX900-NEXT: s_or_b64 exec, exec, s[12:13]
+; GFX900-NEXT: s_andn2_b64 s[12:13], s[6:7], exec
+; GFX900-NEXT: s_and_b64 s[10:11], s[10:11], exec
+; GFX900-NEXT: s_or_b64 s[12:13], s[12:13], s[10:11]
+; GFX900-NEXT: .LBB3_4: ; %Flow2
+; GFX900-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX900-NEXT: s_and_saveexec_b64 s[8:9], s[12:13]
+; GFX900-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX900-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
+; GFX900-NEXT: s_cbranch_execz .LBB3_8
+; GFX900-NEXT: ; %bb.5: ; %if_3
+; GFX900-NEXT: s_movk_i32 s6, 0xfe
+; GFX900-NEXT: v_cmp_lt_i32_e32 vcc, s6, v2
+; GFX900-NEXT: s_mov_b64 s[6:7], -1
+; GFX900-NEXT: s_and_saveexec_b64 s[10:11], vcc
+; GFX900-NEXT: s_cbranch_execz .LBB3_7
+; GFX900-NEXT: ; %bb.6: ; %if_4
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_add_u32_e32 v4, 1, v3
+; GFX900-NEXT: s_xor_b64 s[6:7], exec, -1
+; GFX900-NEXT: .LBB3_7: ; %Flow1
+; GFX900-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX900-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
+; GFX900-NEXT: s_and_b64 s[6:7], s[6:7], exec
+; GFX900-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX900-NEXT: .LBB3_8: ; %Flow
+; GFX900-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX900-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX900-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX900-NEXT: flat_store_dword v[0:1], v4
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %load = load %pair, ptr %ptr
+ br i1 %cond, label %else, label %if
+
+if:
+ %a16 = icmp slt i32 %val, 255
+ br i1 %cond, label %else, label %if_2
+
+if_2:
+ %loaded = load i32, ptr %ptr
+ br label %merge
+
+else:
+ %a_else = extractvalue %pair %load, 0
+ br label %merge
+
+merge:
+ %phi = phi i32 [ %loaded, %if_2 ], [ %a_else, %else ]
+ br i1 %cond, label %if_3, label %else_2
+
+if_3:
+ %a17 = icmp slt i32 %val, 255
+ br i1 %a17, label %else_2, label %if_4
+
+if_4:
+ %sum_load = add i32 %phi, 1
+ br label %merge_2
+
+else_2:
+ %a_else_2 = extractvalue %pair %load, 0
+ br label %merge_2
+
+merge_2:
+ %phi_2 = phi i32 [ %sum_load, %if_4 ], [ %a_else_2, %else_2 ]
+ store i32 %phi_2, ptr %ptr
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/test_isel_single_lane.ll b/llvm/test/CodeGen/AMDGPU/test_isel_single_lane.ll
deleted file mode 100644
index 726e35d..0000000
--- a/llvm/test/CodeGen/AMDGPU/test_isel_single_lane.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GCN %s
-
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1), i32)
-
-
-define amdgpu_kernel void @test_isel_single_lane(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
-; GCN-LABEL: test_isel_single_lane:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_load_b32 s4, s[0:1], 0x58
-; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
-; GCN-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: v_readfirstlane_b32 s0, v1
-; GCN-NEXT: s_addk_co_i32 s0, 0xf4
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_lshl_b32 s1, s0, 4
-; GCN-NEXT: s_mul_i32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT: s_lshl_b32 s0, s0, 12
-; GCN-NEXT: s_sub_co_i32 s0, s1, s0
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: global_store_b32 v0, v1, s[2:3]
-; GCN-NEXT: s_endpgm
- %gep0 = getelementptr i32, ptr addrspace(1) %in, i32 22
- %val0 = load i32, ptr addrspace(1) %gep0, align 4
- %gep1 = getelementptr i32, ptr addrspace(1) %in, i32 4
- %val1 = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr addrspace(1) %gep1, i32 %val0)
- %res0 = add i32 %val1, 244
- %res1 = shl i32 %res0, 4
- %res2 = mul i32 %res0, %res1
- %res3 = shl i32 %res2, 12
- %res4 = sub i32 %res1, %res3
- store i32 %res4, ptr addrspace(1) %out
- ret void
-}
-
-
-attributes #0 = {
- "amdgpu-flat-work-group-size"="1,1"
- "amdgpu-waves-per-eu"="1,1"
- "uniform-work-group-size"="true"
-}
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index 469ea24..9c0beb2 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -223,7 +223,6 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a
; HSA-TRAP-GFX1100-O0-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0)
; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 s0, -1
-; HSA-TRAP-GFX1100-O0-NEXT: ; implicit-def: $sgpr1
; HSA-TRAP-GFX1100-O0-NEXT: v_cmp_eq_u32_e64 s0, v0, s0
; HSA-TRAP-GFX1100-O0-NEXT: s_and_b32 vcc_lo, exec_lo, s0
; HSA-TRAP-GFX1100-O0-NEXT: s_cbranch_vccnz .LBB1_2
diff --git a/llvm/test/CodeGen/AMDGPU/triton_regression_no_waterfall.mir b/llvm/test/CodeGen/AMDGPU/triton_regression_no_waterfall.mir
index 576cf63..dc9b9d6 100644
--- a/llvm/test/CodeGen/AMDGPU/triton_regression_no_waterfall.mir
+++ b/llvm/test/CodeGen/AMDGPU/triton_regression_no_waterfall.mir
@@ -69,8 +69,8 @@ frameInfo:
hasTailCall: false
isCalleeSavedInfoValid: false
localFrameSize: 0
- savePoint: ''
- restorePoint: ''
+ savePoint: []
+ restorePoint: []
fixedStack: []
stack: []
entry_values: []
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index f6c357d..d80ec6b 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -83,6 +83,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_mov_b32 s84, s14
; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11]
; GLOBALNESS1-NEXT: v_mov_b32_e32 v47, 0
+; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
; GLOBALNESS1-NEXT: s_mov_b32 s32, 0
; GLOBALNESS1-NEXT: ; implicit-def: $vgpr58_vgpr59
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
@@ -194,7 +195,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13
; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: .LBB1_13: ; %bb44.lr.ph.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
@@ -271,7 +271,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14
; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: s_branch .LBB1_14
; GLOBALNESS1-NEXT: .LBB1_24: ; %Flow23
@@ -297,12 +296,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1
; GLOBALNESS1-NEXT: ; %bb.27: ; %bb69.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: s_branch .LBB1_1
; GLOBALNESS1-NEXT: .LBB1_28: ; %bb73.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42
; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS1-NEXT: s_branch .LBB1_2
; GLOBALNESS1-NEXT: .LBB1_29: ; %loop.exit.guard
@@ -397,6 +394,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_mov_b32 s82, s14
; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11]
; GLOBALNESS0-NEXT: v_mov_b32_e32 v47, 0
+; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
; GLOBALNESS0-NEXT: s_mov_b32 s32, 0
; GLOBALNESS0-NEXT: ; implicit-def: $vgpr58_vgpr59
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
@@ -509,7 +507,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13
; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: .LBB1_13: ; %bb44.lr.ph.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
@@ -586,7 +583,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14
; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: s_branch .LBB1_14
; GLOBALNESS0-NEXT: .LBB1_24: ; %Flow23
@@ -610,12 +606,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1
; GLOBALNESS0-NEXT: ; %bb.27: ; %bb69.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: s_branch .LBB1_1
; GLOBALNESS0-NEXT: .LBB1_28: ; %bb73.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42
; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off
; GLOBALNESS0-NEXT: s_branch .LBB1_2
; GLOBALNESS0-NEXT: .LBB1_29: ; %loop.exit.guard
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
index 8629d54..6cc3960 100644
--- a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -stop-after twoaddressinstruction < %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -stop-after twoaddressinstruction < %s | FileCheck %s
; Check that %16 gets constrained to register class sgpr_96_with_sub0_sub1.
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg %ptr) {
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index ab278c3..983acfc 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -48,6 +49,22 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: v_uint_to_fp_i64_to_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[2:3], v1
+; GFX942-NEXT: v_ldexp_f64 v[2:3], v[2:3], 32
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
+; GFX942-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX942-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%val = load i64, ptr addrspace(1) %gep, align 8
@@ -88,6 +105,18 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: s_uint_to_fp_i64_to_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX942-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
+; GFX942-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX942-NEXT: s_endpgm
%cast = uitofp i64 %in to double
store double %cast, ptr addrspace(1) %out, align 8
ret void
@@ -136,6 +165,23 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: s_uint_to_fp_v2i64_to_v2f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[4:5], s1
+; GFX942-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32
+; GFX942-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
+; GFX942-NEXT: v_ldexp_f64 v[0:1], v[4:5], 32
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[4:5], s0
+; GFX942-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
+; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7]
+; GFX942-NEXT: s_endpgm
%cast = uitofp <2 x i64> %in to <2 x double>
store <2 x double> %cast, ptr addrspace(1) %out, align 16
ret void
@@ -210,6 +256,32 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4
; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: s_uint_to_fp_v4i64_to_v4f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s11
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[2:3], s10
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[4:5], s9
+; GFX942-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32
+; GFX942-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
+; GFX942-NEXT: v_ldexp_f64 v[0:1], v[4:5], 32
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[4:5], s8
+; GFX942-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[4:5], s15
+; GFX942-NEXT: v_ldexp_f64 v[4:5], v[4:5], 32
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[6:7], s14
+; GFX942-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7]
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[4:5], s13
+; GFX942-NEXT: v_ldexp_f64 v[4:5], v[4:5], 32
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[8:9], s12
+; GFX942-NEXT: v_add_f64 v[4:5], v[4:5], v[8:9]
+; GFX942-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
%cast = uitofp <4 x i64> %in to <4 x double>
store <4 x double> %cast, ptr addrspace(1) %out, align 16
ret void
@@ -243,6 +315,16 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: s_uint_to_fp_i32_to_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: s_endpgm
%cast = uitofp i32 %in to double
store double %cast, ptr addrspace(1) %out, align 8
ret void
@@ -262,6 +344,16 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
+;
+; GFX942-LABEL: s_uint_to_fp_v2i32_to_v2f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[2:3], s3
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
%cast = uitofp <2 x i32> %in to <2 x double>
store <2 x double> %cast, ptr addrspace(1) %out, align 16
ret void
@@ -313,6 +405,20 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: s_uint_to_fp_v4i32_to_v4f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[6:7], s3
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[4:5], s2
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[2:3], s1
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
+; GFX942-NEXT: s_endpgm
%cast = uitofp <4 x i32> %in to <4 x double>
store <4 x double> %cast, ptr addrspace(1) %out, align 16
ret void
@@ -354,6 +460,18 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in)
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: uint_to_fp_i1_to_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_cmp_eq_u32 s2, 0
+; GFX942-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, s2
+; GFX942-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT: s_endpgm
%cmp = icmp eq i32 %in, 0
%fp = uitofp i1 %cmp to double
store double %fp, ptr addrspace(1) %out, align 4
@@ -394,6 +512,19 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: uint_to_fp_i1_to_f64_load:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_bitcmp1_b32 s2, 0
+; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
+; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: s_endpgm
%fp = uitofp i1 %in to double
store double %fp, ptr addrspace(1) %out, align 8
ret void
@@ -429,6 +560,17 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: s_uint_to_fp_i8_to_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_and_b32 s2, s2, 0xff
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT: s_endpgm
%fp = uitofp i8 %in to double
store double %fp, ptr addrspace(1) %out
ret void
@@ -450,6 +592,14 @@ define double @v_uint_to_fp_i8_to_f64(i8 %in) {
; VI-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_uint_to_fp_i8_to_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_mov_b32 s0, 0xffff
+; GFX942-NEXT: v_and_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX942-NEXT: v_cvt_f64_u32_e32 v[0:1], v0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%fp = uitofp i8 %in to double
ret double %fp
}
@@ -488,6 +638,18 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: s_select_uint_to_fp_i1_vals_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_cmp_eq_u32 s2, 0
+; GFX942-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, s2
+; GFX942-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT: s_endpgm
%cmp = icmp eq i32 %in, 0
%select = select i1 %cmp, double 1.0, double 0.0
store double %select, ptr addrspace(1) %out, align 8
@@ -505,6 +667,18 @@ define void @v_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_select_uint_to_fp_i1_vals_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0x3ff00000
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc
+; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %in, 0
%select = select i1 %cmp, double 1.0, double 0.0
store double %select, ptr addrspace(1) %out, align 8
@@ -545,6 +719,18 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: s_select_uint_to_fp_i1_vals_i64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_cmp_eq_u32 s2, 0
+; GFX942-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, s2
+; GFX942-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT: s_endpgm
%cmp = icmp eq i32 %in, 0
%select = select i1 %cmp, i64 u0x3ff0000000000000, i64 0
store i64 %select, ptr addrspace(1) %out, align 8
@@ -562,6 +748,18 @@ define void @v_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_select_uint_to_fp_i1_vals_i64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0x3ff00000
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc
+; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %in, 0
%select = select i1 %cmp, i64 u0x3ff0000000000000, i64 0
store i64 %select, ptr addrspace(1) %out, align 8
@@ -603,6 +801,18 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
+;
+; GFX942-LABEL: s_swap_select_uint_to_fp_i1_vals_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_cmp_eq_u32 s2, 0
+; GFX942-NEXT: s_cselect_b32 s2, 0, 0x3ff00000
+; GFX942-NEXT: v_mov_b32_e32 v1, s2
+; GFX942-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT: s_endpgm
%cmp = icmp eq i32 %in, 0
%select = select i1 %cmp, double 0.0, double 1.0
store double %select, ptr addrspace(1) %out, align 8
@@ -620,6 +830,18 @@ define void @v_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[3:4]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_swap_select_uint_to_fp_i1_vals_f64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v3, 0x3ff00000
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e64 v5, v3, 0, vcc
+; GFX942-NEXT: global_store_dwordx2 v[0:1], v[4:5], off
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %in, 0
%select = select i1 %cmp, double 0.0, double 1.0
store double %select, ptr addrspace(1) %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
index b3166fa..fb971e4 100644
--- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
+++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
@@ -10,33 +10,31 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: v_pk_mov_b32 v[44:45], 0, 0
; CHECK-NEXT: flat_load_dword v42, v[44:45]
-; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
-; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
-; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x8
-; CHECK-NEXT: s_load_dword s64, s[8:9], 0x0
+; CHECK-NEXT: s_load_dwordx4 s[64:67], s[8:9], 0x8
+; CHECK-NEXT: s_load_dword s68, s[8:9], 0x0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9]
+; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
+; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_accvgpr_write_b32 a32, s6
-; CHECK-NEXT: v_accvgpr_write_b32 a33, s7
-; CHECK-NEXT: s_mov_b64 s[6:7], src_private_base
-; CHECK-NEXT: s_cmp_lg_u32 s64, -1
-; CHECK-NEXT: s_cselect_b32 s7, s7, 0
-; CHECK-NEXT: s_cselect_b32 s8, s64, 0
+; CHECK-NEXT: s_cmp_lg_u32 s68, -1
+; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: s_cselect_b32 s5, s5, 0
+; CHECK-NEXT: s_cselect_b32 s6, s68, 0
; CHECK-NEXT: s_add_u32 s50, s34, 48
+; CHECK-NEXT: v_mov_b32_e32 v47, s5
+; CHECK-NEXT: s_mov_b32 s5, s4
; CHECK-NEXT: s_addc_u32 s51, s35, 0
-; CHECK-NEXT: v_pk_mov_b32 v[56:57], s[4:5], s[4:5] op_sel:[0,1]
+; CHECK-NEXT: v_pk_mov_b32 v[62:63], s[4:5], s[4:5] op_sel:[0,1]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, G@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, G@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0
-; CHECK-NEXT: s_mov_b32 s6, 0
-; CHECK-NEXT: v_mov_b32_e32 v47, s7
-; CHECK-NEXT: s_mov_b32 s7, s6
; CHECK-NEXT: s_mov_b32 s53, s14
-; CHECK-NEXT: v_mov_b32_e32 v46, s8
-; CHECK-NEXT: v_pk_mov_b32 v[58:59], s[6:7], s[6:7] op_sel:[0,1]
+; CHECK-NEXT: v_mov_b32_e32 v46, s6
+; CHECK-NEXT: v_pk_mov_b32 v[56:57], s[64:65], s[64:65] op_sel:[0,1]
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51]
@@ -49,13 +47,15 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_mov_b32 s52, s15
; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11]
; CHECK-NEXT: v_mov_b32_e32 v40, v0
-; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[58:59]
+; CHECK-NEXT: v_mov_b32_e32 v60, s66
+; CHECK-NEXT: v_mov_b32_e32 v61, s67
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[62:63]
; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55]
-; CHECK-NEXT: flat_load_dwordx2 v[60:61], v[56:57]
-; CHECK-NEXT: v_mov_b32_e32 v62, 0
-; CHECK-NEXT: v_mov_b32_e32 v63, 0x3ff00000
+; CHECK-NEXT: flat_load_dwordx2 v[58:59], v[56:57]
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000
; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51]
@@ -64,20 +64,22 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
; CHECK-NEXT: s_mov_b32 s13, s52
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_mov_b32_e32 v31, v40
-; CHECK-NEXT: flat_store_dwordx2 v[44:45], v[62:63]
-; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[58:59]
+; CHECK-NEXT: flat_store_dwordx2 v[44:45], v[0:1]
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[62:63]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15
; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55]
; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[46:47] glc
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s64
-; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, 0, v42
-; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[60:61]
+; CHECK-NEXT: v_mov_b32_e32 v1, s67
+; CHECK-NEXT: v_mov_b32_e32 v0, s68
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[58:59]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[56:57], a[32:33]
-; CHECK-NEXT: buffer_store_dword a33, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v62, v0, s[0:3], 0 offen
+; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[60:61]
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, 0, v42
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; CHECK-NEXT: ; implicit-def: $vgpr4
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
index 90891cb..f54e001 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-load-from-tid.ll
@@ -98,10 +98,13 @@ entry:
}
; GCN-LABEL: {{^}}lshr_threadid_3d:
-; GCN: global_load_dword
+; W64: global_load_dword
+; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
+; W32: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
; OPT-LABEL: @lshr_threadid_3d
-; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+; OPT-W64: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4{{$}}
+; OPT-W32: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %div4, !amdgpu.uniform
define amdgpu_kernel void @lshr_threadid_3d(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !2 {
entry:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -114,6 +117,24 @@ entry:
ret void
}
+; GCN-LABEL: {{^}}high_id_uniform:
+; GCN: v_lshlrev_b32_e32 v0, 2, v2
+; GCN: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
+; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], [[OFFSET]]
+
+; OPT-LABEL: @high_id_uniform
+; OPT: %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %zid.zext, !amdgpu.uniform
+define amdgpu_kernel void @high_id_uniform(ptr addrspace(1) align 4 %in, ptr addrspace(1) align 4 %out) !reqd_work_group_size !2 {
+entry:
+ %zid = tail call i32 @llvm.amdgcn.workitem.id.z()
+ %zid.zext = zext nneg i32 %zid to i64
+ %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %zid.zext
+ %load = load i32, ptr addrspace(1) %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %zid.zext
+ store i32 %load, ptr addrspace(1) %arrayidx2, align 4
+ ret void
+}
+
; GCN-LABEL: {{^}}lshr_threadid_1d_uneven:
; W64: global_load_dword
; W32: v_readfirstlane_b32 [[OFFSET:s[0-9]+]], v0
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
index 0fc655a..473d996 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
@@ -31,5 +31,5 @@ define amdgpu_kernel void @kernel1() #1 {
attributes #0 = { "uniform-work-group-size"="true" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
index b5c14c5..c9ee40c 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
@@ -97,8 +97,8 @@ define amdgpu_kernel void @kernel2() #0 {
attributes #0 = { "uniform-work-group-size"="true" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
index 6464ff3..308f8b5 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
@@ -41,6 +41,6 @@ define amdgpu_kernel void @kernel3() #2 {
attributes #2 = { "uniform-work-group-size"="true" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
index ea3c0a3..7e2b085 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
@@ -41,6 +41,6 @@ define amdgpu_kernel void @kernel2() #2 {
attributes #1 = { "uniform-work-group-size"="true" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
index 3823d17..3d6454c 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
@@ -52,8 +52,8 @@ attributes #0 = { nounwind }
attributes #1 = { "uniform-work-group-size"="false" }
attributes #2 = { "uniform-work-group-size"="true" }
;.
-; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { nounwind "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "uniform-work-group-size"="true" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
index e478359..3032d8d 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
@@ -101,7 +101,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %m) #1 {
attributes #0 = { nounwind readnone }
attributes #1 = { "uniform-work-group-size"="true" }
;.
-; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
index 1c054fb..e315e04 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
@@ -61,5 +61,5 @@ define amdgpu_kernel void @kernel3() #0 {
attributes #0 = { "uniform-work-group-size"="false" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/unify-metadata.ll b/llvm/test/CodeGen/AMDGPU/unify-metadata.ll
deleted file mode 100644
index 455993b..0000000
--- a/llvm/test/CodeGen/AMDGPU/unify-metadata.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; RUN: opt -mtriple=amdgcn--amdhsa -passes=amdgpu-unify-metadata -S < %s | FileCheck -check-prefix=ALL %s
-; RUN: opt -mtriple=amdgcn--amdhsa -passes=amdgpu-unify-metadata -S < %s | FileCheck -check-prefix=ALL %s
-
-; This test check that we have a singe metadata value after linking several
-; modules for records such as opencl.ocl.version, llvm.ident and similar.
-
-; ALL-DAG: !opencl.ocl.version = !{![[OCL_VER:[0-9]+]]}
-; ALL-DAG: !llvm.ident = !{![[LLVM_IDENT_0:[0-9]+]], ![[LLVM_IDENT_1:[0-9]+]]}
-; ALL-DAG: !opencl.used.extensions = !{![[USED_EXT_0:[0-9]+]], ![[USED_EXT_1:[0-9]+]], ![[USED_EXT_2:[0-9]+]]}
-
-; ALL-DAG: ![[OCL_VER]] = !{i32 1, i32 2}
-; ALL-DAG: ![[LLVM_IDENT_0]] = !{!"clang version 4.0"}
-; ALL-DAG: ![[LLVM_IDENT_1]] = !{!"clang version 4.0 (rLXXXXXX)"}
-; ALL-DAG: ![[USED_EXT_0]] = !{!"cl_images"}
-; ALL-DAG: ![[USED_EXT_1]] = !{!"cl_khr_fp16"}
-; ALL-DAG: ![[USED_EXT_2]] = !{!"cl_doubles"}
-
-!opencl.ocl.version = !{!1, !0, !0, !0}
-!llvm.ident = !{!2, !2, !2, !2, !6}
-!opencl.used.extensions = !{!3, !3, !4, !5}
-
-!0 = !{i32 2, i32 0}
-!1 = !{i32 1, i32 2}
-!2 = !{!"clang version 4.0"}
-!3 = !{!"cl_images", !"cl_khr_fp16"}
-!4 = !{!"cl_images", !"cl_doubles"}
-!5 = !{}
-!6 = !{!"clang version 4.0 (rLXXXXXX)"}
diff --git a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
index cf5b95a7..bb0ec0d 100644
--- a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
+++ b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
@@ -16,7 +16,7 @@ entry:
}
; CHECK-LABEL: __unnamed_2:
-; CHECK: .set __unnamed_2.num_vgpr, max(32, __unnamed_1.num_vgpr)
+; CHECK: .set __unnamed_2.num_vgpr, max(1, __unnamed_1.num_vgpr)
; CHECK: .set __unnamed_2.num_agpr, max(0, __unnamed_1.num_agpr)
; CHECK: .set __unnamed_2.numbered_sgpr, max(34, __unnamed_1.numbered_sgpr)
; CHECK: .set __unnamed_2.private_seg_size, 16+max(__unnamed_1.private_seg_size)
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-addrspacecast.ll
index 22f0c28..d076685 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-addrspacecast.ll
@@ -7,7 +7,7 @@
; OPT: store i32 0, ptr addrspace(5) %alloca, align 4
; OPT: store i32 1, ptr addrspace(5) %a1, align 4
; OPT: store i32 2, ptr addrspace(5) %a2, align 4
-; OPT: %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
+; OPT: %tmp = getelementptr i32, ptr addrspace(5) %alloca, i64 %index
; OPT: %ac = addrspacecast ptr addrspace(5) %tmp to ptr
; OPT: %data = load i32, ptr %ac, align 4
define amdgpu_kernel void @vector_addrspacecast(ptr addrspace(1) %out, i64 %index) {
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll
index 30ed6ae..3342151 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll
@@ -300,17 +300,15 @@ define i8 @test_vector_reduce_add_v4i8(<4 x i8> %v) {
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_add_v4i8:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, v3.l
; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v0.h
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -348,17 +346,15 @@ define i8 @test_vector_reduce_add_v4i8(<4 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, v3.l
; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v0.h
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
+; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -518,21 +514,19 @@ define i8 @test_vector_reduce_add_v8i8(<8 x i8> %v) {
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_add_v8i8:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, v6.l
; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.h, v3.l, v7.l
; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v5.l
; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v2.l, v6.l
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
@@ -581,21 +575,19 @@ define i8 @test_vector_reduce_add_v8i8(<8 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, v6.l
; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.h, v3.l, v7.l
; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v5.l
; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v4.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, v1.h
-; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v2.l, v6.l
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.h
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
@@ -832,28 +824,25 @@ define i8 @test_vector_reduce_add_v16i8(<16 x i8> %v) {
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.h, v5.l, v13.l
; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v9.l
-; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v3.h, v7.l, v15.l
-; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v3.l, v3.l, v11.l
-; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.h, v6.l, v14.l
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v5.l, v7.l, v15.l
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v6.l, v14.l
; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.l, v2.l, v10.l
-; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, v12.l
-; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v3.l, v3.h
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.h, v3.l, v11.l
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v3.l, v4.l, v12.l
; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v8.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v2.l, v2.h
-; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.l, v2.h, v5.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v3.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -922,28 +911,25 @@ define i8 @test_vector_reduce_add_v16i8(<16 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.h, v5.l, v13.l
; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v9.l
-; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v3.h, v7.l, v15.l
-; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v3.l, v3.l, v11.l
-; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.h, v6.l, v14.l
+; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v5.l, v7.l, v15.l
+; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v6.l, v14.l
; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.l, v2.l, v10.l
-; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v1.h
-; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, v12.l
-; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v3.l, v3.h
+; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.h, v3.l, v11.l
+; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v3.l, v4.l, v12.l
; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v8.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, v0.h
-; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v2.l, v2.h
-; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.l, v2.h, v5.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v3.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.h
-; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
index aab0e76..1d3b42e 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
@@ -374,13 +374,12 @@ define i8 @test_vector_reduce_umin_v4i8(<4 x i8> %v) {
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v1.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v0.h, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -427,13 +426,12 @@ define i8 @test_vector_reduce_umin_v4i8(<4 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v1.l, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v0.h, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -624,22 +622,20 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) {
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v7.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v1.l, v1.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.h, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v1.l, v1.l, v3.l, v3.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v1.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.h, v0.h, v3.l, v3.h
+; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v2.l, v1.h
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3
; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -703,22 +699,20 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v7.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v1.l, v1.l, v1.h
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.h, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v0.h
-; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v1.l, v1.l, v3.l, v3.h
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v1.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.h, v0.h, v3.l, v3.h
+; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v2.l, v1.h
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
+; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3
; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1047,14 +1041,12 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) {
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v1.h, v1.l
; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v1.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1176,14 +1168,12 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) {
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v1.h, v1.l
; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v1.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index fe7def8a..11d724e 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -2190,13 +2190,13 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in,
; GFX942-LABEL: shuffle_scalar_load_v8i32_0123:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[2:3]
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: shuffle_scalar_load_v8i32_0123:
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
index 2cb5e30..ee35dc4 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
@@ -1264,9 +1264,9 @@ define amdgpu_kernel void @k1024_call_no_agprs_ub_callee() #1025 {
}
; GCN-LABEL: {{^}}f1024_0:
-; GFX90A: NumVgprs: 32
+; GFX90A: NumVgprs: 1
; GFX90A: NumAgprs: 1
-; GFX90A: TotalNumVgprs: 33
+; GFX90A: TotalNumVgprs: 5
define void @f1024_0() #1024 {
call void @foo()
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
new file mode 100644
index 0000000..71d4e6b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
@@ -0,0 +1,30 @@
+; RUN: llc -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=CHECK,PACKED
+; RUN: llc -mcpu=gfx1030 < %s | FileCheck %s --check-prefixes=CHECK,NOTPACKED
+target triple = "amdgcn-amd-amdhsa"
+
+@global = addrspace(1) global i32 poison, align 4
+
+; Carefully crafted kernel that uses v0 but never writes a VGPR or reads another VGPR.
+; Only hardware-initialized VGPRs (v0) are read in this kernel.
+
+; CHECK-LABEL: amdhsa.kernels:
+; CHECK-LABEL: kernel_x
+; CHECK: .vgpr_count: 1
+define amdgpu_kernel void @kernel_x(ptr addrspace(8) %rsrc) #0 {
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: kernel_z
+; PACKED: .vgpr_count: 1
+; NOTPACKED: .vgpr_count: 3
+define amdgpu_kernel void @kernel_z(ptr addrspace(8) %rsrc) {
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.z()
+ call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+ ret void
+}
+
+attributes #0 = { "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics-chain.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics-chain.ll
new file mode 100644
index 0000000..a9f4bda
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics-chain.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mcpu=gfx1200 < %s | FileCheck %s
+target triple = "amdgcn--amdpal"
+
+@global = addrspace(1) global i32 poison, align 4
+
+; CHECK-LABEL: amdpal.pipelines:
+
+; Shouldn't report the part of %vgpr_args that's not used
+; CHECK-LABEL: entry_point_symbol: cs_calling_chain
+; CHECK: .vgpr_count: 0xa
+define amdgpu_cs void @cs_calling_chain(i32 %vgpr, i32 inreg %sgpr) {
+ %vgpr_args = insertvalue {i32, i32, i32, i32} poison, i32 %vgpr, 1
+ call void (ptr, i32, i32, {i32, i32, i32, i32}, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.i32.s(
+ ptr @chain_func, i32 0, i32 inreg %sgpr, {i32, i32, i32, i32} %vgpr_args, i32 0)
+ unreachable
+}
+
+; Neither uses nor writes a VGPR
+; CHECK-LABEL: chain_func:
+; CHECK: .vgpr_count: 0x1
+define amdgpu_cs_chain void @chain_func([32 x i32] %args) {
+entry:
+ call void (ptr, i32, {}, [32 x i32], i32, ...) @llvm.amdgcn.cs.chain.p0.i32.s.a(
+ ptr @chain_func, i32 0, {} inreg {}, [32 x i32] %args, i32 0)
+ unreachable
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll
new file mode 100644
index 0000000..ea19763
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mcpu=gfx1200 < %s | FileCheck %s
+target triple = "amdgcn--amdpal"
+
+@global = addrspace(1) global i32 poison, align 4
+
+; CHECK-LABEL: amdpal.pipelines:
+
+; Neither uses nor writes a VGPR, but the hardware initializes the VGPRs that the kernel receives, so they count as used.
+; CHECK-LABEL: .entry_point_symbol: kernel_use
+; CHECK: .vgpr_count: 0x20
+define amdgpu_cs void @kernel_use([32 x i32] %args) {
+entry:
+ %a = extractvalue [32 x i32] %args, 14
+ store i32 %a, ptr addrspace(1) @global
+ ret void
+}
+
+; Neither uses nor writes a VGPR
+; CHECK-LABEL: gfx_func:
+; CHECK: .vgpr_count: 0x20
+define amdgpu_gfx [32 x i32] @gfx_func([32 x i32] %args) {
+entry:
+ ret [32 x i32] %args
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll b/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll
index e0dfdba..67264e9 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll
@@ -13,9 +13,6 @@ define i32 @test_tuple(<16 x i64> %0) {
; GFX900-LABEL: test_tuple:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX900-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX900-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX900-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -31,22 +28,6 @@ define i32 @test_tuple(<16 x i64> %0) {
; GFX900-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX900-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX900-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX900-NEXT: v_writelane_b32 v63, s36, 0
-; GFX900-NEXT: v_writelane_b32 v63, s37, 1
-; GFX900-NEXT: v_writelane_b32 v63, s38, 2
-; GFX900-NEXT: v_writelane_b32 v63, s39, 3
-; GFX900-NEXT: v_writelane_b32 v63, s48, 4
-; GFX900-NEXT: v_writelane_b32 v63, s49, 5
-; GFX900-NEXT: v_writelane_b32 v63, s50, 6
-; GFX900-NEXT: v_writelane_b32 v63, s51, 7
-; GFX900-NEXT: v_writelane_b32 v63, s52, 8
-; GFX900-NEXT: v_writelane_b32 v63, s53, 9
-; GFX900-NEXT: v_writelane_b32 v63, s54, 10
-; GFX900-NEXT: v_writelane_b32 v63, s55, 11
-; GFX900-NEXT: v_writelane_b32 v63, s64, 12
-; GFX900-NEXT: v_writelane_b32 v63, s65, 13
-; GFX900-NEXT: v_writelane_b32 v63, s66, 14
-; GFX900-NEXT: v_writelane_b32 v63, s67, 15
; GFX900-NEXT: v_mov_b32_e32 v33, v30
; GFX900-NEXT: v_mov_b32_e32 v34, v29
; GFX900-NEXT: v_mov_b32_e32 v35, v28
@@ -78,38 +59,6 @@ define i32 @test_tuple(<16 x i64> %0) {
; GFX900-NEXT: v_mov_b32_e32 v61, v2
; GFX900-NEXT: v_mov_b32_e32 v62, v1
; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
-; GFX900-NEXT: ; implicit-def: $sgpr4
; GFX900-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
; GFX900-NEXT: v_mov_b32_e32 v1, v62
; GFX900-NEXT: v_mov_b32_e32 v2, v61
@@ -142,24 +91,7 @@ define i32 @test_tuple(<16 x i64> %0) {
; GFX900-NEXT: v_mov_b32_e32 v29, v34
; GFX900-NEXT: v_mov_b32_e32 v30, v33
; GFX900-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec
-; GFX900-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
; GFX900-NEXT: v_mov_b32_e32 v0, 0
-; GFX900-NEXT: v_readlane_b32 s67, v63, 15
-; GFX900-NEXT: v_readlane_b32 s66, v63, 14
-; GFX900-NEXT: v_readlane_b32 s65, v63, 13
-; GFX900-NEXT: v_readlane_b32 s64, v63, 12
-; GFX900-NEXT: v_readlane_b32 s55, v63, 11
-; GFX900-NEXT: v_readlane_b32 s54, v63, 10
-; GFX900-NEXT: v_readlane_b32 s53, v63, 9
-; GFX900-NEXT: v_readlane_b32 s52, v63, 8
-; GFX900-NEXT: v_readlane_b32 s51, v63, 7
-; GFX900-NEXT: v_readlane_b32 s50, v63, 6
-; GFX900-NEXT: v_readlane_b32 s49, v63, 5
-; GFX900-NEXT: v_readlane_b32 s48, v63, 4
-; GFX900-NEXT: v_readlane_b32 s39, v63, 3
-; GFX900-NEXT: v_readlane_b32 s38, v63, 2
-; GFX900-NEXT: v_readlane_b32 s37, v63, 1
-; GFX900-NEXT: v_readlane_b32 s36, v63, 0
; GFX900-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX900-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX900-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
@@ -175,18 +107,12 @@ define i32 @test_tuple(<16 x i64> %0) {
; GFX900-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX900-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX900-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX900-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX900-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX900-NEXT: s_mov_b64 exec, s[4:5]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: test_tuple:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX906-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX906-NEXT: s_mov_b64 exec, s[4:5]
; GFX906-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -202,22 +128,6 @@ define i32 @test_tuple(<16 x i64> %0) {
; GFX906-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX906-NEXT: v_writelane_b32 v63, s36, 0
-; GFX906-NEXT: v_writelane_b32 v63, s37, 1
-; GFX906-NEXT: v_writelane_b32 v63, s38, 2
-; GFX906-NEXT: v_writelane_b32 v63, s39, 3
-; GFX906-NEXT: v_writelane_b32 v63, s48, 4
-; GFX906-NEXT: v_writelane_b32 v63, s49, 5
-; GFX906-NEXT: v_writelane_b32 v63, s50, 6
-; GFX906-NEXT: v_writelane_b32 v63, s51, 7
-; GFX906-NEXT: v_writelane_b32 v63, s52, 8
-; GFX906-NEXT: v_writelane_b32 v63, s53, 9
-; GFX906-NEXT: v_writelane_b32 v63, s54, 10
-; GFX906-NEXT: v_writelane_b32 v63, s55, 11
-; GFX906-NEXT: v_writelane_b32 v63, s64, 12
-; GFX906-NEXT: v_writelane_b32 v63, s65, 13
-; GFX906-NEXT: v_writelane_b32 v63, s66, 14
-; GFX906-NEXT: v_writelane_b32 v63, s67, 15
; GFX906-NEXT: v_mov_b32_e32 v33, v30
; GFX906-NEXT: v_mov_b32_e32 v34, v29
; GFX906-NEXT: v_mov_b32_e32 v35, v28
@@ -249,38 +159,6 @@ define i32 @test_tuple(<16 x i64> %0) {
; GFX906-NEXT: v_mov_b32_e32 v61, v2
; GFX906-NEXT: v_mov_b32_e32 v62, v1
; GFX906-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
-; GFX906-NEXT: ; implicit-def: $sgpr4
; GFX906-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
; GFX906-NEXT: v_mov_b32_e32 v1, v62
; GFX906-NEXT: v_mov_b32_e32 v2, v61
@@ -313,24 +191,7 @@ define i32 @test_tuple(<16 x i64> %0) {
; GFX906-NEXT: v_mov_b32_e32 v29, v34
; GFX906-NEXT: v_mov_b32_e32 v30, v33
; GFX906-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec
-; GFX906-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
; GFX906-NEXT: v_mov_b32_e32 v0, 0
-; GFX906-NEXT: v_readlane_b32 s67, v63, 15
-; GFX906-NEXT: v_readlane_b32 s66, v63, 14
-; GFX906-NEXT: v_readlane_b32 s65, v63, 13
-; GFX906-NEXT: v_readlane_b32 s64, v63, 12
-; GFX906-NEXT: v_readlane_b32 s55, v63, 11
-; GFX906-NEXT: v_readlane_b32 s54, v63, 10
-; GFX906-NEXT: v_readlane_b32 s53, v63, 9
-; GFX906-NEXT: v_readlane_b32 s52, v63, 8
-; GFX906-NEXT: v_readlane_b32 s51, v63, 7
-; GFX906-NEXT: v_readlane_b32 s50, v63, 6
-; GFX906-NEXT: v_readlane_b32 s49, v63, 5
-; GFX906-NEXT: v_readlane_b32 s48, v63, 4
-; GFX906-NEXT: v_readlane_b32 s39, v63, 3
-; GFX906-NEXT: v_readlane_b32 s38, v63, 2
-; GFX906-NEXT: v_readlane_b32 s37, v63, 1
-; GFX906-NEXT: v_readlane_b32 s36, v63, 0
; GFX906-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX906-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX906-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
@@ -346,18 +207,12 @@ define i32 @test_tuple(<16 x i64> %0) {
; GFX906-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX906-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX906-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX906-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX906-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX906-NEXT: s_mov_b64 exec, s[4:5]
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: test_tuple:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX908-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX908-NEXT: s_mov_b64 exec, s[4:5]
; GFX908-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
@@ -372,22 +227,7 @@ define i32 @test_tuple(<16 x i64> %0) {
; GFX908-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse
-; GFX908-NEXT: v_writelane_b32 v62, s36, 0
-; GFX908-NEXT: v_writelane_b32 v62, s37, 1
-; GFX908-NEXT: v_writelane_b32 v62, s38, 2
-; GFX908-NEXT: v_writelane_b32 v62, s39, 3
-; GFX908-NEXT: v_writelane_b32 v62, s48, 4
-; GFX908-NEXT: v_writelane_b32 v62, s49, 5
-; GFX908-NEXT: v_writelane_b32 v62, s50, 6
-; GFX908-NEXT: v_writelane_b32 v62, s51, 7
-; GFX908-NEXT: v_writelane_b32 v62, s52, 8
-; GFX908-NEXT: v_writelane_b32 v62, s53, 9
-; GFX908-NEXT: v_writelane_b32 v62, s54, 10
-; GFX908-NEXT: v_writelane_b32 v62, s55, 11
-; GFX908-NEXT: v_writelane_b32 v62, s64, 12
-; GFX908-NEXT: v_writelane_b32 v62, s65, 13
-; GFX908-NEXT: v_writelane_b32 v62, s66, 14
-; GFX908-NEXT: v_writelane_b32 v62, s67, 15
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse
; GFX908-NEXT: v_mov_b32_e32 v33, v30
; GFX908-NEXT: v_mov_b32_e32 v34, v29
; GFX908-NEXT: v_mov_b32_e32 v35, v28
@@ -417,46 +257,10 @@ define i32 @test_tuple(<16 x i64> %0) {
; GFX908-NEXT: v_mov_b32_e32 v59, v4
; GFX908-NEXT: v_mov_b32_e32 v60, v3
; GFX908-NEXT: v_mov_b32_e32 v61, v2
-; GFX908-NEXT: v_mov_b32_e32 v32, v1
-; GFX908-NEXT: buffer_load_dword v1, off, s[0:3], s32
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_accvgpr_write_b32 a14, v1 ; Reload Reuse
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
-; GFX908-NEXT: ; implicit-def: $sgpr4
+; GFX908-NEXT: v_mov_b32_e32 v62, v1
+; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s32
; GFX908-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
-; GFX908-NEXT: v_mov_b32_e32 v1, v32
-; GFX908-NEXT: s_nop 1
-; GFX908-NEXT: v_accvgpr_read_b32 v32, a14 ; Reload Reuse
+; GFX908-NEXT: v_mov_b32_e32 v1, v62
; GFX908-NEXT: v_mov_b32_e32 v2, v61
; GFX908-NEXT: v_mov_b32_e32 v3, v60
; GFX908-NEXT: v_mov_b32_e32 v4, v59
@@ -487,24 +291,8 @@ define i32 @test_tuple(<16 x i64> %0) {
; GFX908-NEXT: v_mov_b32_e32 v29, v34
; GFX908-NEXT: v_mov_b32_e32 v30, v33
; GFX908-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec
-; GFX908-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
; GFX908-NEXT: v_mov_b32_e32 v0, 0
-; GFX908-NEXT: v_readlane_b32 s67, v62, 15
-; GFX908-NEXT: v_readlane_b32 s66, v62, 14
-; GFX908-NEXT: v_readlane_b32 s65, v62, 13
-; GFX908-NEXT: v_readlane_b32 s64, v62, 12
-; GFX908-NEXT: v_readlane_b32 s55, v62, 11
-; GFX908-NEXT: v_readlane_b32 s54, v62, 10
-; GFX908-NEXT: v_readlane_b32 s53, v62, 9
-; GFX908-NEXT: v_readlane_b32 s52, v62, 8
-; GFX908-NEXT: v_readlane_b32 s51, v62, 7
-; GFX908-NEXT: v_readlane_b32 s50, v62, 6
-; GFX908-NEXT: v_readlane_b32 s49, v62, 5
-; GFX908-NEXT: v_readlane_b32 s48, v62, 4
-; GFX908-NEXT: v_readlane_b32 s39, v62, 3
-; GFX908-NEXT: v_readlane_b32 s38, v62, 2
-; GFX908-NEXT: v_readlane_b32 s37, v62, 1
-; GFX908-NEXT: v_readlane_b32 s36, v62, 0
+; GFX908-NEXT: v_accvgpr_read_b32 v62, a14 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v61, a13 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse
@@ -519,18 +307,12 @@ define i32 @test_tuple(<16 x i64> %0) {
; GFX908-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse
; GFX908-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse
-; GFX908-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX908-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX908-NEXT: s_mov_b64 exec, s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX90a-LABEL: test_tuple:
; GFX90a: ; %bb.0:
; GFX90a-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90a-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX90a-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX90a-NEXT: s_mov_b64 exec, s[4:5]
; GFX90a-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
; GFX90a-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
; GFX90a-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
@@ -546,22 +328,6 @@ define i32 @test_tuple(<16 x i64> %0) {
; GFX90a-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse
; GFX90a-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse
; GFX90a-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse
-; GFX90a-NEXT: v_writelane_b32 v63, s36, 0
-; GFX90a-NEXT: v_writelane_b32 v63, s37, 1
-; GFX90a-NEXT: v_writelane_b32 v63, s38, 2
-; GFX90a-NEXT: v_writelane_b32 v63, s39, 3
-; GFX90a-NEXT: v_writelane_b32 v63, s48, 4
-; GFX90a-NEXT: v_writelane_b32 v63, s49, 5
-; GFX90a-NEXT: v_writelane_b32 v63, s50, 6
-; GFX90a-NEXT: v_writelane_b32 v63, s51, 7
-; GFX90a-NEXT: v_writelane_b32 v63, s52, 8
-; GFX90a-NEXT: v_writelane_b32 v63, s53, 9
-; GFX90a-NEXT: v_writelane_b32 v63, s54, 10
-; GFX90a-NEXT: v_writelane_b32 v63, s55, 11
-; GFX90a-NEXT: v_writelane_b32 v63, s64, 12
-; GFX90a-NEXT: v_writelane_b32 v63, s65, 13
-; GFX90a-NEXT: v_writelane_b32 v63, s66, 14
-; GFX90a-NEXT: v_writelane_b32 v63, s67, 15
; GFX90a-NEXT: v_mov_b32_e32 v33, v30
; GFX90a-NEXT: v_mov_b32_e32 v34, v29
; GFX90a-NEXT: v_mov_b32_e32 v35, v28
@@ -593,38 +359,6 @@ define i32 @test_tuple(<16 x i64> %0) {
; GFX90a-NEXT: v_mov_b32_e32 v61, v2
; GFX90a-NEXT: v_mov_b32_e32 v62, v1
; GFX90a-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
-; GFX90a-NEXT: ; implicit-def: $sgpr4
; GFX90a-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
; GFX90a-NEXT: v_mov_b32_e32 v1, v62
; GFX90a-NEXT: v_mov_b32_e32 v2, v61
@@ -657,24 +391,7 @@ define i32 @test_tuple(<16 x i64> %0) {
; GFX90a-NEXT: v_mov_b32_e32 v29, v34
; GFX90a-NEXT: v_mov_b32_e32 v30, v33
; GFX90a-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec
-; GFX90a-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_readlane_b32 s67, v63, 15
-; GFX90a-NEXT: v_readlane_b32 s66, v63, 14
-; GFX90a-NEXT: v_readlane_b32 s65, v63, 13
-; GFX90a-NEXT: v_readlane_b32 s64, v63, 12
-; GFX90a-NEXT: v_readlane_b32 s55, v63, 11
-; GFX90a-NEXT: v_readlane_b32 s54, v63, 10
-; GFX90a-NEXT: v_readlane_b32 s53, v63, 9
-; GFX90a-NEXT: v_readlane_b32 s52, v63, 8
-; GFX90a-NEXT: v_readlane_b32 s51, v63, 7
-; GFX90a-NEXT: v_readlane_b32 s50, v63, 6
-; GFX90a-NEXT: v_readlane_b32 s49, v63, 5
-; GFX90a-NEXT: v_readlane_b32 s48, v63, 4
-; GFX90a-NEXT: v_readlane_b32 s39, v63, 3
-; GFX90a-NEXT: v_readlane_b32 s38, v63, 2
-; GFX90a-NEXT: v_readlane_b32 s37, v63, 1
-; GFX90a-NEXT: v_readlane_b32 s36, v63, 0
; GFX90a-NEXT: v_accvgpr_read_b32 v62, a14 ; Reload Reuse
; GFX90a-NEXT: v_accvgpr_read_b32 v61, a13 ; Reload Reuse
; GFX90a-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse
@@ -690,9 +407,6 @@ define i32 @test_tuple(<16 x i64> %0) {
; GFX90a-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse
; GFX90a-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse
; GFX90a-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse
-; GFX90a-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX90a-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX90a-NEXT: s_mov_b64 exec, s[4:5]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: s_setpc_b64 s[30:31]
%2 = shufflevector <16 x i64> %0, <16 x i64> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir b/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir
index 9dad99f3..fa4461c 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir
@@ -135,3 +135,48 @@ body: |
S_ENDPGM 0, implicit %4
...
+
+---
+name: av_mov_imm_b64
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: av_mov_imm_b64
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $sgpr0_sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0_sub1:vreg_192 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+ ; CHECK-NEXT: [[AV_MOV_:%[0-9]+]].sub2_sub3:vreg_192 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
+ ; CHECK-NEXT: $exec = S_MOV_B64_term [[COPY]]
+ ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[AV_MOV_:%[0-9]+]].sub0_sub1:vreg_192 = V_MUL_F64_e64 0, [[AV_MOV_]].sub0_sub1, 0, [[AV_MOV_]].sub0_sub1, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[AV_MOV_:%[0-9]+]].sub2_sub3:vreg_192 = V_MUL_F64_e64 0, [[AV_MOV_]].sub2_sub3, 0, [[AV_MOV_]].sub2_sub3, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[AV_MOV_]]
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ %0:av_64 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+ %1:vreg_64 = COPY %0
+ %2:vreg_64 = COPY %0
+ %3:sreg_64 = COPY $sgpr0_sgpr1
+ $exec = S_MOV_B64_term %3:sreg_64
+ S_CBRANCH_EXECZ %bb.2, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ %1:vreg_64 = V_MUL_F64_e64 0, %1:vreg_64, 0, %1:vreg_64, 0, 0, implicit $mode, implicit $exec
+ %2:vreg_64 = V_MUL_F64_e64 0, %2:vreg_64, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec
+
+ bb.2:
+ undef %4.sub0_sub1:vreg_192 = COPY %1:vreg_64
+ %4.sub2_sub3:vreg_192 = COPY %2:vreg_64
+ S_ENDPGM 0, implicit %4
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
index 9ed5332..c40ba2b 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
@@ -18,7 +18,6 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() {
; CHECK-NEXT: global_load_ushort v2, v0, s[4:5] offset:4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT: ; implicit-def: $sgpr4
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, s4
; CHECK-NEXT: v_mov_b32_e32 v1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
index 77dc32d..2d7a91f 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
+++ b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
define amdgpu_ps void @intrinsic_store_system_scope(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GFX12-LABEL: intrinsic_store_system_scope:
diff --git a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir
index acf8bd3..1e9d4dd 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir
+++ b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-memory-legalizer %s -o - | FileCheck -check-prefix=GFX12 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-memory-legalizer %s -o - | FileCheck --check-prefix=GFX1200 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=si-memory-legalizer %s -o - | FileCheck --check-prefix=GFX1250 %s
---
name: intrinsic_store_system_scope
@@ -7,17 +8,23 @@ body: |
bb.0:
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-LABEL: name: intrinsic_store_system_scope
- ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: S_WAIT_LOADCNT_soft 0
- ; GFX12-NEXT: S_WAIT_SAMPLECNT_soft 0
- ; GFX12-NEXT: S_WAIT_BVHCNT_soft 0
- ; GFX12-NEXT: S_WAIT_KMCNT_soft 0
- ; GFX12-NEXT: S_WAIT_STORECNT_soft 0
- ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 24, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
- ; GFX12-NEXT: S_ENDPGM 0
- BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 24, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1200-LABEL: name: intrinsic_store_system_scope
+ ; GFX1200: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1200-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX1200-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX1200-NEXT: S_WAIT_KMCNT_soft 0
+ ; GFX1200-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1200-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact killed renamable $vgpr0, killed renamable $vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 24, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: intrinsic_store_system_scope
+ ; GFX1250: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact killed renamable $vgpr0, killed renamable $vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 24, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+ ; GFX1250-NEXT: S_ENDPGM 0
+ BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact killed renamable $vgpr0, killed renamable $vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 24, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
S_ENDPGM 0
...
@@ -27,17 +34,24 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX12-LABEL: name: generic_store_volatile
- ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2
- ; GFX12-NEXT: {{ $}}
- ; GFX12-NEXT: S_WAIT_LOADCNT_soft 0
- ; GFX12-NEXT: S_WAIT_SAMPLECNT_soft 0
- ; GFX12-NEXT: S_WAIT_BVHCNT_soft 0
- ; GFX12-NEXT: S_WAIT_KMCNT_soft 0
- ; GFX12-NEXT: S_WAIT_STORECNT_soft 0
- ; GFX12-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr1_vgpr2, killed renamable $vgpr0, 0, 24, implicit $exec :: (volatile store (s32), addrspace 1)
- ; GFX12-NEXT: S_WAIT_STORECNT_soft 0
- ; GFX12-NEXT: S_ENDPGM 0
- GLOBAL_STORE_DWORD killed renamable $vgpr1_vgpr2, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1)
+ ; GFX1200-LABEL: name: generic_store_volatile
+ ; GFX1200: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: S_WAIT_LOADCNT_soft 0
+ ; GFX1200-NEXT: S_WAIT_SAMPLECNT_soft 0
+ ; GFX1200-NEXT: S_WAIT_BVHCNT_soft 0
+ ; GFX1200-NEXT: S_WAIT_KMCNT_soft 0
+ ; GFX1200-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1200-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr2_vgpr3, killed renamable $vgpr0, 0, 24, implicit $exec :: (volatile store (s32), addrspace 1)
+ ; GFX1200-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1200-NEXT: S_ENDPGM 0
+ ;
+ ; GFX1250-LABEL: name: generic_store_volatile
+ ; GFX1250: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr2_vgpr3, killed renamable $vgpr0, 0, 24, implicit $exec :: (volatile store (s32), addrspace 1)
+ ; GFX1250-NEXT: S_WAIT_STORECNT_soft 0
+ ; GFX1250-NEXT: S_ENDPGM 0
+ GLOBAL_STORE_DWORD killed renamable $vgpr2_vgpr3, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1)
S_ENDPGM 0
...
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
index 292091a..bb47392 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
@@ -62,7 +62,7 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: S_WAITCNT 0
; GFX11-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
- ; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
+ ; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 1
; GFX11-NEXT: S_BARRIER
; GFX11-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX11-NEXT: S_WAITCNT 7
@@ -176,7 +176,7 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: S_WAITCNT 0
; GFX11-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
- ; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
+ ; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 1
; GFX11-NEXT: S_BARRIER
; GFX11-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX11-NEXT: S_WAITCNT 7
diff --git a/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll b/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
index 76c331c..e2ef60b 100644
--- a/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
@@ -1,6 +1,9 @@
-; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN -check-prefix=SI -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN -check-prefix=SI -enable-var-scope %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -enable-var-scope %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN -check-prefix=SI -enable-var-scope %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -enable-var-scope %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
; This compute shader has input args that claim that it has 17 sgprs and 5 vgprs
; in wave dispatch. Ensure that the sgpr and vgpr counts in COMPUTE_PGM_RSRC1
@@ -17,7 +20,7 @@
; GCN-NEXT: .scratch_memory_size: 0
; SI-NEXT: .sgpr_count: 0x11
; VI-NEXT: .sgpr_count: 0x60
-; GFX9-NEXT: .sgpr_count: 0x11
+; GFX9-NEXT: .sgpr_count: 0x15
; SI-NEXT: .vgpr_count: 0x5
; VI-NEXT: .vgpr_count: 0x5
; GFX9-NEXT: .vgpr_count: 0x5
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
index a13a68a..36e8adb 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
@@ -2412,3 +2412,1427 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
%ret = call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent
ret <2 x half> %ret
}
+
+declare amdgpu_gfx_whole_wave float @callee(i1 %active, <8 x float> %x)
+
+define amdgpu_cs void @call_from_entry(<8 x float> %x, ptr %p) {
+; DAGISEL-LABEL: call_from_entry:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_mov_b32 s1, callee@abs32@hi
+; DAGISEL-NEXT: s_mov_b32 s0, callee@abs32@lo
+; DAGISEL-NEXT: s_mov_b32 s32, 0
+; DAGISEL-NEXT: v_dual_mov_b32 v41, v9 :: v_dual_mov_b32 v40, v8
+; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL-NEXT: flat_store_b32 v[40:41], v0
+; DAGISEL-NEXT: s_endpgm
+;
+; GISEL-LABEL: call_from_entry:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo
+; GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v41, v9
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL-NEXT: flat_store_b32 v[40:41], v0
+; GISEL-NEXT: s_endpgm
+;
+; DAGISEL64-LABEL: call_from_entry:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_mov_b32 s1, callee@abs32@hi
+; DAGISEL64-NEXT: s_mov_b32 s0, callee@abs32@lo
+; DAGISEL64-NEXT: s_mov_b32 s32, 0
+; DAGISEL64-NEXT: v_mov_b32_e32 v41, v9
+; DAGISEL64-NEXT: v_mov_b32_e32 v40, v8
+; DAGISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL64-NEXT: flat_store_b32 v[40:41], v0
+; DAGISEL64-NEXT: s_endpgm
+;
+; GISEL64-LABEL: call_from_entry:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_mov_b32 s0, callee@abs32@lo
+; GISEL64-NEXT: s_mov_b32 s1, callee@abs32@hi
+; GISEL64-NEXT: s_mov_b32 s32, 0
+; GISEL64-NEXT: v_mov_b32_e32 v40, v8
+; GISEL64-NEXT: v_mov_b32_e32 v41, v9
+; GISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL64-NEXT: flat_store_b32 v[40:41], v0
+; GISEL64-NEXT: s_endpgm
+ %ret = call float(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @callee, <8 x float> %x) convergent
+ store float %ret, ptr %p
+ ret void
+}
+
+define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> %x, ptr %p) {
+; DAGISEL-LABEL: call_from_whole_wave:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_mov_b32 s0, s33
+; DAGISEL-NEXT: s_mov_b32 s33, s32
+; DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8
+; DAGISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12
+; DAGISEL-NEXT: scratch_store_b32 off, v3, s33 offset:16
+; DAGISEL-NEXT: scratch_store_b32 off, v4, s33 offset:20
+; DAGISEL-NEXT: scratch_store_b32 off, v5, s33 offset:24
+; DAGISEL-NEXT: scratch_store_b32 off, v6, s33 offset:28
+; DAGISEL-NEXT: scratch_store_b32 off, v7, s33 offset:32
+; DAGISEL-NEXT: scratch_store_b32 off, v8, s33 offset:36
+; DAGISEL-NEXT: scratch_store_b32 off, v9, s33 offset:40
+; DAGISEL-NEXT: scratch_store_b32 off, v10, s33 offset:44
+; DAGISEL-NEXT: scratch_store_b32 off, v11, s33 offset:48
+; DAGISEL-NEXT: scratch_store_b32 off, v12, s33 offset:52
+; DAGISEL-NEXT: scratch_store_b32 off, v13, s33 offset:56
+; DAGISEL-NEXT: scratch_store_b32 off, v14, s33 offset:60
+; DAGISEL-NEXT: scratch_store_b32 off, v15, s33 offset:64
+; DAGISEL-NEXT: scratch_store_b32 off, v16, s33 offset:68
+; DAGISEL-NEXT: scratch_store_b32 off, v17, s33 offset:72
+; DAGISEL-NEXT: scratch_store_b32 off, v18, s33 offset:76
+; DAGISEL-NEXT: scratch_store_b32 off, v19, s33 offset:80
+; DAGISEL-NEXT: scratch_store_b32 off, v20, s33 offset:84
+; DAGISEL-NEXT: scratch_store_b32 off, v21, s33 offset:88
+; DAGISEL-NEXT: scratch_store_b32 off, v22, s33 offset:92
+; DAGISEL-NEXT: scratch_store_b32 off, v23, s33 offset:96
+; DAGISEL-NEXT: scratch_store_b32 off, v24, s33 offset:100
+; DAGISEL-NEXT: scratch_store_b32 off, v25, s33 offset:104
+; DAGISEL-NEXT: scratch_store_b32 off, v26, s33 offset:108
+; DAGISEL-NEXT: scratch_store_b32 off, v27, s33 offset:112
+; DAGISEL-NEXT: scratch_store_b32 off, v28, s33 offset:116
+; DAGISEL-NEXT: scratch_store_b32 off, v29, s33 offset:120
+; DAGISEL-NEXT: scratch_store_b32 off, v30, s33 offset:124
+; DAGISEL-NEXT: scratch_store_b32 off, v31, s33 offset:128
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_store_b32 off, v32, s33 offset:132
+; DAGISEL-NEXT: scratch_store_b32 off, v33, s33 offset:136
+; DAGISEL-NEXT: scratch_store_b32 off, v34, s33 offset:140
+; DAGISEL-NEXT: scratch_store_b32 off, v35, s33 offset:144
+; DAGISEL-NEXT: scratch_store_b32 off, v36, s33 offset:148
+; DAGISEL-NEXT: scratch_store_b32 off, v37, s33 offset:152
+; DAGISEL-NEXT: scratch_store_b32 off, v38, s33 offset:156
+; DAGISEL-NEXT: scratch_store_b32 off, v39, s33 offset:160
+; DAGISEL-NEXT: scratch_store_b32 off, v48, s33 offset:172
+; DAGISEL-NEXT: scratch_store_b32 off, v49, s33 offset:176
+; DAGISEL-NEXT: scratch_store_b32 off, v50, s33 offset:180
+; DAGISEL-NEXT: scratch_store_b32 off, v51, s33 offset:184
+; DAGISEL-NEXT: scratch_store_b32 off, v52, s33 offset:188
+; DAGISEL-NEXT: scratch_store_b32 off, v53, s33 offset:192
+; DAGISEL-NEXT: scratch_store_b32 off, v54, s33 offset:196
+; DAGISEL-NEXT: scratch_store_b32 off, v55, s33 offset:200
+; DAGISEL-NEXT: scratch_store_b32 off, v64, s33 offset:204
+; DAGISEL-NEXT: scratch_store_b32 off, v65, s33 offset:208
+; DAGISEL-NEXT: scratch_store_b32 off, v66, s33 offset:212
+; DAGISEL-NEXT: scratch_store_b32 off, v67, s33 offset:216
+; DAGISEL-NEXT: scratch_store_b32 off, v68, s33 offset:220
+; DAGISEL-NEXT: scratch_store_b32 off, v69, s33 offset:224
+; DAGISEL-NEXT: scratch_store_b32 off, v70, s33 offset:228
+; DAGISEL-NEXT: scratch_store_b32 off, v71, s33 offset:232
+; DAGISEL-NEXT: scratch_store_b32 off, v80, s33 offset:236
+; DAGISEL-NEXT: scratch_store_b32 off, v81, s33 offset:240
+; DAGISEL-NEXT: scratch_store_b32 off, v82, s33 offset:244
+; DAGISEL-NEXT: scratch_store_b32 off, v83, s33 offset:248
+; DAGISEL-NEXT: scratch_store_b32 off, v84, s33 offset:252
+; DAGISEL-NEXT: scratch_store_b32 off, v85, s33 offset:256
+; DAGISEL-NEXT: scratch_store_b32 off, v86, s33 offset:260
+; DAGISEL-NEXT: scratch_store_b32 off, v87, s33 offset:264
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_store_b32 off, v96, s33 offset:268
+; DAGISEL-NEXT: scratch_store_b32 off, v97, s33 offset:272
+; DAGISEL-NEXT: scratch_store_b32 off, v98, s33 offset:276
+; DAGISEL-NEXT: scratch_store_b32 off, v99, s33 offset:280
+; DAGISEL-NEXT: scratch_store_b32 off, v100, s33 offset:284
+; DAGISEL-NEXT: scratch_store_b32 off, v101, s33 offset:288
+; DAGISEL-NEXT: scratch_store_b32 off, v102, s33 offset:292
+; DAGISEL-NEXT: scratch_store_b32 off, v103, s33 offset:296
+; DAGISEL-NEXT: scratch_store_b32 off, v112, s33 offset:300
+; DAGISEL-NEXT: scratch_store_b32 off, v113, s33 offset:304
+; DAGISEL-NEXT: scratch_store_b32 off, v114, s33 offset:308
+; DAGISEL-NEXT: scratch_store_b32 off, v115, s33 offset:312
+; DAGISEL-NEXT: scratch_store_b32 off, v116, s33 offset:316
+; DAGISEL-NEXT: scratch_store_b32 off, v117, s33 offset:320
+; DAGISEL-NEXT: scratch_store_b32 off, v118, s33 offset:324
+; DAGISEL-NEXT: scratch_store_b32 off, v119, s33 offset:328
+; DAGISEL-NEXT: scratch_store_b32 off, v128, s33 offset:332
+; DAGISEL-NEXT: scratch_store_b32 off, v129, s33 offset:336
+; DAGISEL-NEXT: scratch_store_b32 off, v130, s33 offset:340
+; DAGISEL-NEXT: scratch_store_b32 off, v131, s33 offset:344
+; DAGISEL-NEXT: scratch_store_b32 off, v132, s33 offset:348
+; DAGISEL-NEXT: scratch_store_b32 off, v133, s33 offset:352
+; DAGISEL-NEXT: scratch_store_b32 off, v134, s33 offset:356
+; DAGISEL-NEXT: scratch_store_b32 off, v135, s33 offset:360
+; DAGISEL-NEXT: scratch_store_b32 off, v144, s33 offset:364
+; DAGISEL-NEXT: scratch_store_b32 off, v145, s33 offset:368
+; DAGISEL-NEXT: scratch_store_b32 off, v146, s33 offset:372
+; DAGISEL-NEXT: scratch_store_b32 off, v147, s33 offset:376
+; DAGISEL-NEXT: scratch_store_b32 off, v148, s33 offset:380
+; DAGISEL-NEXT: scratch_store_b32 off, v149, s33 offset:384
+; DAGISEL-NEXT: scratch_store_b32 off, v150, s33 offset:388
+; DAGISEL-NEXT: scratch_store_b32 off, v151, s33 offset:392
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_store_b32 off, v160, s33 offset:396
+; DAGISEL-NEXT: scratch_store_b32 off, v161, s33 offset:400
+; DAGISEL-NEXT: scratch_store_b32 off, v162, s33 offset:404
+; DAGISEL-NEXT: scratch_store_b32 off, v163, s33 offset:408
+; DAGISEL-NEXT: scratch_store_b32 off, v164, s33 offset:412
+; DAGISEL-NEXT: scratch_store_b32 off, v165, s33 offset:416
+; DAGISEL-NEXT: scratch_store_b32 off, v166, s33 offset:420
+; DAGISEL-NEXT: scratch_store_b32 off, v167, s33 offset:424
+; DAGISEL-NEXT: scratch_store_b32 off, v176, s33 offset:428
+; DAGISEL-NEXT: scratch_store_b32 off, v177, s33 offset:432
+; DAGISEL-NEXT: scratch_store_b32 off, v178, s33 offset:436
+; DAGISEL-NEXT: scratch_store_b32 off, v179, s33 offset:440
+; DAGISEL-NEXT: scratch_store_b32 off, v180, s33 offset:444
+; DAGISEL-NEXT: scratch_store_b32 off, v181, s33 offset:448
+; DAGISEL-NEXT: scratch_store_b32 off, v182, s33 offset:452
+; DAGISEL-NEXT: scratch_store_b32 off, v183, s33 offset:456
+; DAGISEL-NEXT: scratch_store_b32 off, v192, s33 offset:460
+; DAGISEL-NEXT: scratch_store_b32 off, v193, s33 offset:464
+; DAGISEL-NEXT: scratch_store_b32 off, v194, s33 offset:468
+; DAGISEL-NEXT: scratch_store_b32 off, v195, s33 offset:472
+; DAGISEL-NEXT: scratch_store_b32 off, v196, s33 offset:476
+; DAGISEL-NEXT: scratch_store_b32 off, v197, s33 offset:480
+; DAGISEL-NEXT: scratch_store_b32 off, v198, s33 offset:484
+; DAGISEL-NEXT: scratch_store_b32 off, v199, s33 offset:488
+; DAGISEL-NEXT: scratch_store_b32 off, v208, s33 offset:492
+; DAGISEL-NEXT: scratch_store_b32 off, v209, s33 offset:496
+; DAGISEL-NEXT: scratch_store_b32 off, v210, s33 offset:500
+; DAGISEL-NEXT: scratch_store_b32 off, v211, s33 offset:504
+; DAGISEL-NEXT: scratch_store_b32 off, v212, s33 offset:508
+; DAGISEL-NEXT: scratch_store_b32 off, v213, s33 offset:512
+; DAGISEL-NEXT: scratch_store_b32 off, v214, s33 offset:516
+; DAGISEL-NEXT: scratch_store_b32 off, v215, s33 offset:520
+; DAGISEL-NEXT: s_clause 0xf
+; DAGISEL-NEXT: scratch_store_b32 off, v224, s33 offset:524
+; DAGISEL-NEXT: scratch_store_b32 off, v225, s33 offset:528
+; DAGISEL-NEXT: scratch_store_b32 off, v226, s33 offset:532
+; DAGISEL-NEXT: scratch_store_b32 off, v227, s33 offset:536
+; DAGISEL-NEXT: scratch_store_b32 off, v228, s33 offset:540
+; DAGISEL-NEXT: scratch_store_b32 off, v229, s33 offset:544
+; DAGISEL-NEXT: scratch_store_b32 off, v230, s33 offset:548
+; DAGISEL-NEXT: scratch_store_b32 off, v231, s33 offset:552
+; DAGISEL-NEXT: scratch_store_b32 off, v240, s33 offset:556
+; DAGISEL-NEXT: scratch_store_b32 off, v241, s33 offset:560
+; DAGISEL-NEXT: scratch_store_b32 off, v242, s33 offset:564
+; DAGISEL-NEXT: scratch_store_b32 off, v243, s33 offset:568
+; DAGISEL-NEXT: scratch_store_b32 off, v244, s33 offset:572
+; DAGISEL-NEXT: scratch_store_b32 off, v245, s33 offset:576
+; DAGISEL-NEXT: scratch_store_b32 off, v246, s33 offset:580
+; DAGISEL-NEXT: scratch_store_b32 off, v247, s33 offset:584
+; DAGISEL-NEXT: s_mov_b32 exec_lo, -1
+; DAGISEL-NEXT: s_clause 0x2
+; DAGISEL-NEXT: scratch_store_b32 off, v42, s33
+; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 offset:164
+; DAGISEL-NEXT: scratch_store_b32 off, v41, s33 offset:168
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: v_writelane_b32 v42, s0, 3
+; DAGISEL-NEXT: s_mov_b32 s1, callee@abs32@hi
+; DAGISEL-NEXT: s_mov_b32 s0, callee@abs32@lo
+; DAGISEL-NEXT: s_addk_co_i32 s32, 0x250
+; DAGISEL-NEXT: v_dual_mov_b32 v41, v9 :: v_dual_mov_b32 v40, v8
+; DAGISEL-NEXT: v_writelane_b32 v42, s4, 0
+; DAGISEL-NEXT: v_writelane_b32 v42, s30, 1
+; DAGISEL-NEXT: v_writelane_b32 v42, s31, 2
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL-NEXT: flat_store_b32 v[40:41], v0
+; DAGISEL-NEXT: v_readlane_b32 s31, v42, 2
+; DAGISEL-NEXT: v_readlane_b32 s30, v42, 1
+; DAGISEL-NEXT: v_readlane_b32 s4, v42, 0
+; DAGISEL-NEXT: v_readlane_b32 s0, v42, 3
+; DAGISEL-NEXT: s_clause 0x2
+; DAGISEL-NEXT: scratch_load_b32 v42, off, s33
+; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 offset:164
+; DAGISEL-NEXT: scratch_load_b32 v41, off, s33 offset:168
+; DAGISEL-NEXT: s_mov_b32 s32, s33
+; DAGISEL-NEXT: s_xor_b32 exec_lo, s4, -1
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4
+; DAGISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8
+; DAGISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12
+; DAGISEL-NEXT: scratch_load_b32 v3, off, s33 offset:16
+; DAGISEL-NEXT: scratch_load_b32 v4, off, s33 offset:20
+; DAGISEL-NEXT: scratch_load_b32 v5, off, s33 offset:24
+; DAGISEL-NEXT: scratch_load_b32 v6, off, s33 offset:28
+; DAGISEL-NEXT: scratch_load_b32 v7, off, s33 offset:32
+; DAGISEL-NEXT: scratch_load_b32 v8, off, s33 offset:36
+; DAGISEL-NEXT: scratch_load_b32 v9, off, s33 offset:40
+; DAGISEL-NEXT: scratch_load_b32 v10, off, s33 offset:44
+; DAGISEL-NEXT: scratch_load_b32 v11, off, s33 offset:48
+; DAGISEL-NEXT: scratch_load_b32 v12, off, s33 offset:52
+; DAGISEL-NEXT: scratch_load_b32 v13, off, s33 offset:56
+; DAGISEL-NEXT: scratch_load_b32 v14, off, s33 offset:60
+; DAGISEL-NEXT: scratch_load_b32 v15, off, s33 offset:64
+; DAGISEL-NEXT: scratch_load_b32 v16, off, s33 offset:68
+; DAGISEL-NEXT: scratch_load_b32 v17, off, s33 offset:72
+; DAGISEL-NEXT: scratch_load_b32 v18, off, s33 offset:76
+; DAGISEL-NEXT: scratch_load_b32 v19, off, s33 offset:80
+; DAGISEL-NEXT: scratch_load_b32 v20, off, s33 offset:84
+; DAGISEL-NEXT: scratch_load_b32 v21, off, s33 offset:88
+; DAGISEL-NEXT: scratch_load_b32 v22, off, s33 offset:92
+; DAGISEL-NEXT: scratch_load_b32 v23, off, s33 offset:96
+; DAGISEL-NEXT: scratch_load_b32 v24, off, s33 offset:100
+; DAGISEL-NEXT: scratch_load_b32 v25, off, s33 offset:104
+; DAGISEL-NEXT: scratch_load_b32 v26, off, s33 offset:108
+; DAGISEL-NEXT: scratch_load_b32 v27, off, s33 offset:112
+; DAGISEL-NEXT: scratch_load_b32 v28, off, s33 offset:116
+; DAGISEL-NEXT: scratch_load_b32 v29, off, s33 offset:120
+; DAGISEL-NEXT: scratch_load_b32 v30, off, s33 offset:124
+; DAGISEL-NEXT: scratch_load_b32 v31, off, s33 offset:128
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_load_b32 v32, off, s33 offset:132
+; DAGISEL-NEXT: scratch_load_b32 v33, off, s33 offset:136
+; DAGISEL-NEXT: scratch_load_b32 v34, off, s33 offset:140
+; DAGISEL-NEXT: scratch_load_b32 v35, off, s33 offset:144
+; DAGISEL-NEXT: scratch_load_b32 v36, off, s33 offset:148
+; DAGISEL-NEXT: scratch_load_b32 v37, off, s33 offset:152
+; DAGISEL-NEXT: scratch_load_b32 v38, off, s33 offset:156
+; DAGISEL-NEXT: scratch_load_b32 v39, off, s33 offset:160
+; DAGISEL-NEXT: scratch_load_b32 v48, off, s33 offset:172
+; DAGISEL-NEXT: scratch_load_b32 v49, off, s33 offset:176
+; DAGISEL-NEXT: scratch_load_b32 v50, off, s33 offset:180
+; DAGISEL-NEXT: scratch_load_b32 v51, off, s33 offset:184
+; DAGISEL-NEXT: scratch_load_b32 v52, off, s33 offset:188
+; DAGISEL-NEXT: scratch_load_b32 v53, off, s33 offset:192
+; DAGISEL-NEXT: scratch_load_b32 v54, off, s33 offset:196
+; DAGISEL-NEXT: scratch_load_b32 v55, off, s33 offset:200
+; DAGISEL-NEXT: scratch_load_b32 v64, off, s33 offset:204
+; DAGISEL-NEXT: scratch_load_b32 v65, off, s33 offset:208
+; DAGISEL-NEXT: scratch_load_b32 v66, off, s33 offset:212
+; DAGISEL-NEXT: scratch_load_b32 v67, off, s33 offset:216
+; DAGISEL-NEXT: scratch_load_b32 v68, off, s33 offset:220
+; DAGISEL-NEXT: scratch_load_b32 v69, off, s33 offset:224
+; DAGISEL-NEXT: scratch_load_b32 v70, off, s33 offset:228
+; DAGISEL-NEXT: scratch_load_b32 v71, off, s33 offset:232
+; DAGISEL-NEXT: scratch_load_b32 v80, off, s33 offset:236
+; DAGISEL-NEXT: scratch_load_b32 v81, off, s33 offset:240
+; DAGISEL-NEXT: scratch_load_b32 v82, off, s33 offset:244
+; DAGISEL-NEXT: scratch_load_b32 v83, off, s33 offset:248
+; DAGISEL-NEXT: scratch_load_b32 v84, off, s33 offset:252
+; DAGISEL-NEXT: scratch_load_b32 v85, off, s33 offset:256
+; DAGISEL-NEXT: scratch_load_b32 v86, off, s33 offset:260
+; DAGISEL-NEXT: scratch_load_b32 v87, off, s33 offset:264
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_load_b32 v96, off, s33 offset:268
+; DAGISEL-NEXT: scratch_load_b32 v97, off, s33 offset:272
+; DAGISEL-NEXT: scratch_load_b32 v98, off, s33 offset:276
+; DAGISEL-NEXT: scratch_load_b32 v99, off, s33 offset:280
+; DAGISEL-NEXT: scratch_load_b32 v100, off, s33 offset:284
+; DAGISEL-NEXT: scratch_load_b32 v101, off, s33 offset:288
+; DAGISEL-NEXT: scratch_load_b32 v102, off, s33 offset:292
+; DAGISEL-NEXT: scratch_load_b32 v103, off, s33 offset:296
+; DAGISEL-NEXT: scratch_load_b32 v112, off, s33 offset:300
+; DAGISEL-NEXT: scratch_load_b32 v113, off, s33 offset:304
+; DAGISEL-NEXT: scratch_load_b32 v114, off, s33 offset:308
+; DAGISEL-NEXT: scratch_load_b32 v115, off, s33 offset:312
+; DAGISEL-NEXT: scratch_load_b32 v116, off, s33 offset:316
+; DAGISEL-NEXT: scratch_load_b32 v117, off, s33 offset:320
+; DAGISEL-NEXT: scratch_load_b32 v118, off, s33 offset:324
+; DAGISEL-NEXT: scratch_load_b32 v119, off, s33 offset:328
+; DAGISEL-NEXT: scratch_load_b32 v128, off, s33 offset:332
+; DAGISEL-NEXT: scratch_load_b32 v129, off, s33 offset:336
+; DAGISEL-NEXT: scratch_load_b32 v130, off, s33 offset:340
+; DAGISEL-NEXT: scratch_load_b32 v131, off, s33 offset:344
+; DAGISEL-NEXT: scratch_load_b32 v132, off, s33 offset:348
+; DAGISEL-NEXT: scratch_load_b32 v133, off, s33 offset:352
+; DAGISEL-NEXT: scratch_load_b32 v134, off, s33 offset:356
+; DAGISEL-NEXT: scratch_load_b32 v135, off, s33 offset:360
+; DAGISEL-NEXT: scratch_load_b32 v144, off, s33 offset:364
+; DAGISEL-NEXT: scratch_load_b32 v145, off, s33 offset:368
+; DAGISEL-NEXT: scratch_load_b32 v146, off, s33 offset:372
+; DAGISEL-NEXT: scratch_load_b32 v147, off, s33 offset:376
+; DAGISEL-NEXT: scratch_load_b32 v148, off, s33 offset:380
+; DAGISEL-NEXT: scratch_load_b32 v149, off, s33 offset:384
+; DAGISEL-NEXT: scratch_load_b32 v150, off, s33 offset:388
+; DAGISEL-NEXT: scratch_load_b32 v151, off, s33 offset:392
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_load_b32 v160, off, s33 offset:396
+; DAGISEL-NEXT: scratch_load_b32 v161, off, s33 offset:400
+; DAGISEL-NEXT: scratch_load_b32 v162, off, s33 offset:404
+; DAGISEL-NEXT: scratch_load_b32 v163, off, s33 offset:408
+; DAGISEL-NEXT: scratch_load_b32 v164, off, s33 offset:412
+; DAGISEL-NEXT: scratch_load_b32 v165, off, s33 offset:416
+; DAGISEL-NEXT: scratch_load_b32 v166, off, s33 offset:420
+; DAGISEL-NEXT: scratch_load_b32 v167, off, s33 offset:424
+; DAGISEL-NEXT: scratch_load_b32 v176, off, s33 offset:428
+; DAGISEL-NEXT: scratch_load_b32 v177, off, s33 offset:432
+; DAGISEL-NEXT: scratch_load_b32 v178, off, s33 offset:436
+; DAGISEL-NEXT: scratch_load_b32 v179, off, s33 offset:440
+; DAGISEL-NEXT: scratch_load_b32 v180, off, s33 offset:444
+; DAGISEL-NEXT: scratch_load_b32 v181, off, s33 offset:448
+; DAGISEL-NEXT: scratch_load_b32 v182, off, s33 offset:452
+; DAGISEL-NEXT: scratch_load_b32 v183, off, s33 offset:456
+; DAGISEL-NEXT: scratch_load_b32 v192, off, s33 offset:460
+; DAGISEL-NEXT: scratch_load_b32 v193, off, s33 offset:464
+; DAGISEL-NEXT: scratch_load_b32 v194, off, s33 offset:468
+; DAGISEL-NEXT: scratch_load_b32 v195, off, s33 offset:472
+; DAGISEL-NEXT: scratch_load_b32 v196, off, s33 offset:476
+; DAGISEL-NEXT: scratch_load_b32 v197, off, s33 offset:480
+; DAGISEL-NEXT: scratch_load_b32 v198, off, s33 offset:484
+; DAGISEL-NEXT: scratch_load_b32 v199, off, s33 offset:488
+; DAGISEL-NEXT: scratch_load_b32 v208, off, s33 offset:492
+; DAGISEL-NEXT: scratch_load_b32 v209, off, s33 offset:496
+; DAGISEL-NEXT: scratch_load_b32 v210, off, s33 offset:500
+; DAGISEL-NEXT: scratch_load_b32 v211, off, s33 offset:504
+; DAGISEL-NEXT: scratch_load_b32 v212, off, s33 offset:508
+; DAGISEL-NEXT: scratch_load_b32 v213, off, s33 offset:512
+; DAGISEL-NEXT: scratch_load_b32 v214, off, s33 offset:516
+; DAGISEL-NEXT: scratch_load_b32 v215, off, s33 offset:520
+; DAGISEL-NEXT: s_clause 0xf
+; DAGISEL-NEXT: scratch_load_b32 v224, off, s33 offset:524
+; DAGISEL-NEXT: scratch_load_b32 v225, off, s33 offset:528
+; DAGISEL-NEXT: scratch_load_b32 v226, off, s33 offset:532
+; DAGISEL-NEXT: scratch_load_b32 v227, off, s33 offset:536
+; DAGISEL-NEXT: scratch_load_b32 v228, off, s33 offset:540
+; DAGISEL-NEXT: scratch_load_b32 v229, off, s33 offset:544
+; DAGISEL-NEXT: scratch_load_b32 v230, off, s33 offset:548
+; DAGISEL-NEXT: scratch_load_b32 v231, off, s33 offset:552
+; DAGISEL-NEXT: scratch_load_b32 v240, off, s33 offset:556
+; DAGISEL-NEXT: scratch_load_b32 v241, off, s33 offset:560
+; DAGISEL-NEXT: scratch_load_b32 v242, off, s33 offset:564
+; DAGISEL-NEXT: scratch_load_b32 v243, off, s33 offset:568
+; DAGISEL-NEXT: scratch_load_b32 v244, off, s33 offset:572
+; DAGISEL-NEXT: scratch_load_b32 v245, off, s33 offset:576
+; DAGISEL-NEXT: scratch_load_b32 v246, off, s33 offset:580
+; DAGISEL-NEXT: scratch_load_b32 v247, off, s33 offset:584
+; DAGISEL-NEXT: s_mov_b32 exec_lo, s4
+; DAGISEL-NEXT: s_mov_b32 s33, s0
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: call_from_whole_wave:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_mov_b32 s0, s33
+; GISEL-NEXT: s_mov_b32 s33, s32
+; GISEL-NEXT: s_xor_saveexec_b32 s4, -1
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4
+; GISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8
+; GISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12
+; GISEL-NEXT: scratch_store_b32 off, v3, s33 offset:16
+; GISEL-NEXT: scratch_store_b32 off, v4, s33 offset:20
+; GISEL-NEXT: scratch_store_b32 off, v5, s33 offset:24
+; GISEL-NEXT: scratch_store_b32 off, v6, s33 offset:28
+; GISEL-NEXT: scratch_store_b32 off, v7, s33 offset:32
+; GISEL-NEXT: scratch_store_b32 off, v8, s33 offset:36
+; GISEL-NEXT: scratch_store_b32 off, v9, s33 offset:40
+; GISEL-NEXT: scratch_store_b32 off, v10, s33 offset:44
+; GISEL-NEXT: scratch_store_b32 off, v11, s33 offset:48
+; GISEL-NEXT: scratch_store_b32 off, v12, s33 offset:52
+; GISEL-NEXT: scratch_store_b32 off, v13, s33 offset:56
+; GISEL-NEXT: scratch_store_b32 off, v14, s33 offset:60
+; GISEL-NEXT: scratch_store_b32 off, v15, s33 offset:64
+; GISEL-NEXT: scratch_store_b32 off, v16, s33 offset:68
+; GISEL-NEXT: scratch_store_b32 off, v17, s33 offset:72
+; GISEL-NEXT: scratch_store_b32 off, v18, s33 offset:76
+; GISEL-NEXT: scratch_store_b32 off, v19, s33 offset:80
+; GISEL-NEXT: scratch_store_b32 off, v20, s33 offset:84
+; GISEL-NEXT: scratch_store_b32 off, v21, s33 offset:88
+; GISEL-NEXT: scratch_store_b32 off, v22, s33 offset:92
+; GISEL-NEXT: scratch_store_b32 off, v23, s33 offset:96
+; GISEL-NEXT: scratch_store_b32 off, v24, s33 offset:100
+; GISEL-NEXT: scratch_store_b32 off, v25, s33 offset:104
+; GISEL-NEXT: scratch_store_b32 off, v26, s33 offset:108
+; GISEL-NEXT: scratch_store_b32 off, v27, s33 offset:112
+; GISEL-NEXT: scratch_store_b32 off, v28, s33 offset:116
+; GISEL-NEXT: scratch_store_b32 off, v29, s33 offset:120
+; GISEL-NEXT: scratch_store_b32 off, v30, s33 offset:124
+; GISEL-NEXT: scratch_store_b32 off, v31, s33 offset:128
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_store_b32 off, v32, s33 offset:132
+; GISEL-NEXT: scratch_store_b32 off, v33, s33 offset:136
+; GISEL-NEXT: scratch_store_b32 off, v34, s33 offset:140
+; GISEL-NEXT: scratch_store_b32 off, v35, s33 offset:144
+; GISEL-NEXT: scratch_store_b32 off, v36, s33 offset:148
+; GISEL-NEXT: scratch_store_b32 off, v37, s33 offset:152
+; GISEL-NEXT: scratch_store_b32 off, v38, s33 offset:156
+; GISEL-NEXT: scratch_store_b32 off, v39, s33 offset:160
+; GISEL-NEXT: scratch_store_b32 off, v48, s33 offset:172
+; GISEL-NEXT: scratch_store_b32 off, v49, s33 offset:176
+; GISEL-NEXT: scratch_store_b32 off, v50, s33 offset:180
+; GISEL-NEXT: scratch_store_b32 off, v51, s33 offset:184
+; GISEL-NEXT: scratch_store_b32 off, v52, s33 offset:188
+; GISEL-NEXT: scratch_store_b32 off, v53, s33 offset:192
+; GISEL-NEXT: scratch_store_b32 off, v54, s33 offset:196
+; GISEL-NEXT: scratch_store_b32 off, v55, s33 offset:200
+; GISEL-NEXT: scratch_store_b32 off, v64, s33 offset:204
+; GISEL-NEXT: scratch_store_b32 off, v65, s33 offset:208
+; GISEL-NEXT: scratch_store_b32 off, v66, s33 offset:212
+; GISEL-NEXT: scratch_store_b32 off, v67, s33 offset:216
+; GISEL-NEXT: scratch_store_b32 off, v68, s33 offset:220
+; GISEL-NEXT: scratch_store_b32 off, v69, s33 offset:224
+; GISEL-NEXT: scratch_store_b32 off, v70, s33 offset:228
+; GISEL-NEXT: scratch_store_b32 off, v71, s33 offset:232
+; GISEL-NEXT: scratch_store_b32 off, v80, s33 offset:236
+; GISEL-NEXT: scratch_store_b32 off, v81, s33 offset:240
+; GISEL-NEXT: scratch_store_b32 off, v82, s33 offset:244
+; GISEL-NEXT: scratch_store_b32 off, v83, s33 offset:248
+; GISEL-NEXT: scratch_store_b32 off, v84, s33 offset:252
+; GISEL-NEXT: scratch_store_b32 off, v85, s33 offset:256
+; GISEL-NEXT: scratch_store_b32 off, v86, s33 offset:260
+; GISEL-NEXT: scratch_store_b32 off, v87, s33 offset:264
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_store_b32 off, v96, s33 offset:268
+; GISEL-NEXT: scratch_store_b32 off, v97, s33 offset:272
+; GISEL-NEXT: scratch_store_b32 off, v98, s33 offset:276
+; GISEL-NEXT: scratch_store_b32 off, v99, s33 offset:280
+; GISEL-NEXT: scratch_store_b32 off, v100, s33 offset:284
+; GISEL-NEXT: scratch_store_b32 off, v101, s33 offset:288
+; GISEL-NEXT: scratch_store_b32 off, v102, s33 offset:292
+; GISEL-NEXT: scratch_store_b32 off, v103, s33 offset:296
+; GISEL-NEXT: scratch_store_b32 off, v112, s33 offset:300
+; GISEL-NEXT: scratch_store_b32 off, v113, s33 offset:304
+; GISEL-NEXT: scratch_store_b32 off, v114, s33 offset:308
+; GISEL-NEXT: scratch_store_b32 off, v115, s33 offset:312
+; GISEL-NEXT: scratch_store_b32 off, v116, s33 offset:316
+; GISEL-NEXT: scratch_store_b32 off, v117, s33 offset:320
+; GISEL-NEXT: scratch_store_b32 off, v118, s33 offset:324
+; GISEL-NEXT: scratch_store_b32 off, v119, s33 offset:328
+; GISEL-NEXT: scratch_store_b32 off, v128, s33 offset:332
+; GISEL-NEXT: scratch_store_b32 off, v129, s33 offset:336
+; GISEL-NEXT: scratch_store_b32 off, v130, s33 offset:340
+; GISEL-NEXT: scratch_store_b32 off, v131, s33 offset:344
+; GISEL-NEXT: scratch_store_b32 off, v132, s33 offset:348
+; GISEL-NEXT: scratch_store_b32 off, v133, s33 offset:352
+; GISEL-NEXT: scratch_store_b32 off, v134, s33 offset:356
+; GISEL-NEXT: scratch_store_b32 off, v135, s33 offset:360
+; GISEL-NEXT: scratch_store_b32 off, v144, s33 offset:364
+; GISEL-NEXT: scratch_store_b32 off, v145, s33 offset:368
+; GISEL-NEXT: scratch_store_b32 off, v146, s33 offset:372
+; GISEL-NEXT: scratch_store_b32 off, v147, s33 offset:376
+; GISEL-NEXT: scratch_store_b32 off, v148, s33 offset:380
+; GISEL-NEXT: scratch_store_b32 off, v149, s33 offset:384
+; GISEL-NEXT: scratch_store_b32 off, v150, s33 offset:388
+; GISEL-NEXT: scratch_store_b32 off, v151, s33 offset:392
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_store_b32 off, v160, s33 offset:396
+; GISEL-NEXT: scratch_store_b32 off, v161, s33 offset:400
+; GISEL-NEXT: scratch_store_b32 off, v162, s33 offset:404
+; GISEL-NEXT: scratch_store_b32 off, v163, s33 offset:408
+; GISEL-NEXT: scratch_store_b32 off, v164, s33 offset:412
+; GISEL-NEXT: scratch_store_b32 off, v165, s33 offset:416
+; GISEL-NEXT: scratch_store_b32 off, v166, s33 offset:420
+; GISEL-NEXT: scratch_store_b32 off, v167, s33 offset:424
+; GISEL-NEXT: scratch_store_b32 off, v176, s33 offset:428
+; GISEL-NEXT: scratch_store_b32 off, v177, s33 offset:432
+; GISEL-NEXT: scratch_store_b32 off, v178, s33 offset:436
+; GISEL-NEXT: scratch_store_b32 off, v179, s33 offset:440
+; GISEL-NEXT: scratch_store_b32 off, v180, s33 offset:444
+; GISEL-NEXT: scratch_store_b32 off, v181, s33 offset:448
+; GISEL-NEXT: scratch_store_b32 off, v182, s33 offset:452
+; GISEL-NEXT: scratch_store_b32 off, v183, s33 offset:456
+; GISEL-NEXT: scratch_store_b32 off, v192, s33 offset:460
+; GISEL-NEXT: scratch_store_b32 off, v193, s33 offset:464
+; GISEL-NEXT: scratch_store_b32 off, v194, s33 offset:468
+; GISEL-NEXT: scratch_store_b32 off, v195, s33 offset:472
+; GISEL-NEXT: scratch_store_b32 off, v196, s33 offset:476
+; GISEL-NEXT: scratch_store_b32 off, v197, s33 offset:480
+; GISEL-NEXT: scratch_store_b32 off, v198, s33 offset:484
+; GISEL-NEXT: scratch_store_b32 off, v199, s33 offset:488
+; GISEL-NEXT: scratch_store_b32 off, v208, s33 offset:492
+; GISEL-NEXT: scratch_store_b32 off, v209, s33 offset:496
+; GISEL-NEXT: scratch_store_b32 off, v210, s33 offset:500
+; GISEL-NEXT: scratch_store_b32 off, v211, s33 offset:504
+; GISEL-NEXT: scratch_store_b32 off, v212, s33 offset:508
+; GISEL-NEXT: scratch_store_b32 off, v213, s33 offset:512
+; GISEL-NEXT: scratch_store_b32 off, v214, s33 offset:516
+; GISEL-NEXT: scratch_store_b32 off, v215, s33 offset:520
+; GISEL-NEXT: s_clause 0xf
+; GISEL-NEXT: scratch_store_b32 off, v224, s33 offset:524
+; GISEL-NEXT: scratch_store_b32 off, v225, s33 offset:528
+; GISEL-NEXT: scratch_store_b32 off, v226, s33 offset:532
+; GISEL-NEXT: scratch_store_b32 off, v227, s33 offset:536
+; GISEL-NEXT: scratch_store_b32 off, v228, s33 offset:540
+; GISEL-NEXT: scratch_store_b32 off, v229, s33 offset:544
+; GISEL-NEXT: scratch_store_b32 off, v230, s33 offset:548
+; GISEL-NEXT: scratch_store_b32 off, v231, s33 offset:552
+; GISEL-NEXT: scratch_store_b32 off, v240, s33 offset:556
+; GISEL-NEXT: scratch_store_b32 off, v241, s33 offset:560
+; GISEL-NEXT: scratch_store_b32 off, v242, s33 offset:564
+; GISEL-NEXT: scratch_store_b32 off, v243, s33 offset:568
+; GISEL-NEXT: scratch_store_b32 off, v244, s33 offset:572
+; GISEL-NEXT: scratch_store_b32 off, v245, s33 offset:576
+; GISEL-NEXT: scratch_store_b32 off, v246, s33 offset:580
+; GISEL-NEXT: scratch_store_b32 off, v247, s33 offset:584
+; GISEL-NEXT: s_mov_b32 exec_lo, -1
+; GISEL-NEXT: s_clause 0x2
+; GISEL-NEXT: scratch_store_b32 off, v42, s33
+; GISEL-NEXT: scratch_store_b32 off, v40, s33 offset:164
+; GISEL-NEXT: scratch_store_b32 off, v41, s33 offset:168
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: v_writelane_b32 v42, s0, 3
+; GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo
+; GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi
+; GISEL-NEXT: s_addk_co_i32 s32, 0x250
+; GISEL-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v41, v9
+; GISEL-NEXT: v_writelane_b32 v42, s4, 0
+; GISEL-NEXT: v_writelane_b32 v42, s30, 1
+; GISEL-NEXT: v_writelane_b32 v42, s31, 2
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL-NEXT: flat_store_b32 v[40:41], v0
+; GISEL-NEXT: v_readlane_b32 s31, v42, 2
+; GISEL-NEXT: v_readlane_b32 s30, v42, 1
+; GISEL-NEXT: v_readlane_b32 s4, v42, 0
+; GISEL-NEXT: v_readlane_b32 s0, v42, 3
+; GISEL-NEXT: s_clause 0x2
+; GISEL-NEXT: scratch_load_b32 v42, off, s33
+; GISEL-NEXT: scratch_load_b32 v40, off, s33 offset:164
+; GISEL-NEXT: scratch_load_b32 v41, off, s33 offset:168
+; GISEL-NEXT: s_mov_b32 s32, s33
+; GISEL-NEXT: s_xor_b32 exec_lo, s4, -1
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4
+; GISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8
+; GISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12
+; GISEL-NEXT: scratch_load_b32 v3, off, s33 offset:16
+; GISEL-NEXT: scratch_load_b32 v4, off, s33 offset:20
+; GISEL-NEXT: scratch_load_b32 v5, off, s33 offset:24
+; GISEL-NEXT: scratch_load_b32 v6, off, s33 offset:28
+; GISEL-NEXT: scratch_load_b32 v7, off, s33 offset:32
+; GISEL-NEXT: scratch_load_b32 v8, off, s33 offset:36
+; GISEL-NEXT: scratch_load_b32 v9, off, s33 offset:40
+; GISEL-NEXT: scratch_load_b32 v10, off, s33 offset:44
+; GISEL-NEXT: scratch_load_b32 v11, off, s33 offset:48
+; GISEL-NEXT: scratch_load_b32 v12, off, s33 offset:52
+; GISEL-NEXT: scratch_load_b32 v13, off, s33 offset:56
+; GISEL-NEXT: scratch_load_b32 v14, off, s33 offset:60
+; GISEL-NEXT: scratch_load_b32 v15, off, s33 offset:64
+; GISEL-NEXT: scratch_load_b32 v16, off, s33 offset:68
+; GISEL-NEXT: scratch_load_b32 v17, off, s33 offset:72
+; GISEL-NEXT: scratch_load_b32 v18, off, s33 offset:76
+; GISEL-NEXT: scratch_load_b32 v19, off, s33 offset:80
+; GISEL-NEXT: scratch_load_b32 v20, off, s33 offset:84
+; GISEL-NEXT: scratch_load_b32 v21, off, s33 offset:88
+; GISEL-NEXT: scratch_load_b32 v22, off, s33 offset:92
+; GISEL-NEXT: scratch_load_b32 v23, off, s33 offset:96
+; GISEL-NEXT: scratch_load_b32 v24, off, s33 offset:100
+; GISEL-NEXT: scratch_load_b32 v25, off, s33 offset:104
+; GISEL-NEXT: scratch_load_b32 v26, off, s33 offset:108
+; GISEL-NEXT: scratch_load_b32 v27, off, s33 offset:112
+; GISEL-NEXT: scratch_load_b32 v28, off, s33 offset:116
+; GISEL-NEXT: scratch_load_b32 v29, off, s33 offset:120
+; GISEL-NEXT: scratch_load_b32 v30, off, s33 offset:124
+; GISEL-NEXT: scratch_load_b32 v31, off, s33 offset:128
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_load_b32 v32, off, s33 offset:132
+; GISEL-NEXT: scratch_load_b32 v33, off, s33 offset:136
+; GISEL-NEXT: scratch_load_b32 v34, off, s33 offset:140
+; GISEL-NEXT: scratch_load_b32 v35, off, s33 offset:144
+; GISEL-NEXT: scratch_load_b32 v36, off, s33 offset:148
+; GISEL-NEXT: scratch_load_b32 v37, off, s33 offset:152
+; GISEL-NEXT: scratch_load_b32 v38, off, s33 offset:156
+; GISEL-NEXT: scratch_load_b32 v39, off, s33 offset:160
+; GISEL-NEXT: scratch_load_b32 v48, off, s33 offset:172
+; GISEL-NEXT: scratch_load_b32 v49, off, s33 offset:176
+; GISEL-NEXT: scratch_load_b32 v50, off, s33 offset:180
+; GISEL-NEXT: scratch_load_b32 v51, off, s33 offset:184
+; GISEL-NEXT: scratch_load_b32 v52, off, s33 offset:188
+; GISEL-NEXT: scratch_load_b32 v53, off, s33 offset:192
+; GISEL-NEXT: scratch_load_b32 v54, off, s33 offset:196
+; GISEL-NEXT: scratch_load_b32 v55, off, s33 offset:200
+; GISEL-NEXT: scratch_load_b32 v64, off, s33 offset:204
+; GISEL-NEXT: scratch_load_b32 v65, off, s33 offset:208
+; GISEL-NEXT: scratch_load_b32 v66, off, s33 offset:212
+; GISEL-NEXT: scratch_load_b32 v67, off, s33 offset:216
+; GISEL-NEXT: scratch_load_b32 v68, off, s33 offset:220
+; GISEL-NEXT: scratch_load_b32 v69, off, s33 offset:224
+; GISEL-NEXT: scratch_load_b32 v70, off, s33 offset:228
+; GISEL-NEXT: scratch_load_b32 v71, off, s33 offset:232
+; GISEL-NEXT: scratch_load_b32 v80, off, s33 offset:236
+; GISEL-NEXT: scratch_load_b32 v81, off, s33 offset:240
+; GISEL-NEXT: scratch_load_b32 v82, off, s33 offset:244
+; GISEL-NEXT: scratch_load_b32 v83, off, s33 offset:248
+; GISEL-NEXT: scratch_load_b32 v84, off, s33 offset:252
+; GISEL-NEXT: scratch_load_b32 v85, off, s33 offset:256
+; GISEL-NEXT: scratch_load_b32 v86, off, s33 offset:260
+; GISEL-NEXT: scratch_load_b32 v87, off, s33 offset:264
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_load_b32 v96, off, s33 offset:268
+; GISEL-NEXT: scratch_load_b32 v97, off, s33 offset:272
+; GISEL-NEXT: scratch_load_b32 v98, off, s33 offset:276
+; GISEL-NEXT: scratch_load_b32 v99, off, s33 offset:280
+; GISEL-NEXT: scratch_load_b32 v100, off, s33 offset:284
+; GISEL-NEXT: scratch_load_b32 v101, off, s33 offset:288
+; GISEL-NEXT: scratch_load_b32 v102, off, s33 offset:292
+; GISEL-NEXT: scratch_load_b32 v103, off, s33 offset:296
+; GISEL-NEXT: scratch_load_b32 v112, off, s33 offset:300
+; GISEL-NEXT: scratch_load_b32 v113, off, s33 offset:304
+; GISEL-NEXT: scratch_load_b32 v114, off, s33 offset:308
+; GISEL-NEXT: scratch_load_b32 v115, off, s33 offset:312
+; GISEL-NEXT: scratch_load_b32 v116, off, s33 offset:316
+; GISEL-NEXT: scratch_load_b32 v117, off, s33 offset:320
+; GISEL-NEXT: scratch_load_b32 v118, off, s33 offset:324
+; GISEL-NEXT: scratch_load_b32 v119, off, s33 offset:328
+; GISEL-NEXT: scratch_load_b32 v128, off, s33 offset:332
+; GISEL-NEXT: scratch_load_b32 v129, off, s33 offset:336
+; GISEL-NEXT: scratch_load_b32 v130, off, s33 offset:340
+; GISEL-NEXT: scratch_load_b32 v131, off, s33 offset:344
+; GISEL-NEXT: scratch_load_b32 v132, off, s33 offset:348
+; GISEL-NEXT: scratch_load_b32 v133, off, s33 offset:352
+; GISEL-NEXT: scratch_load_b32 v134, off, s33 offset:356
+; GISEL-NEXT: scratch_load_b32 v135, off, s33 offset:360
+; GISEL-NEXT: scratch_load_b32 v144, off, s33 offset:364
+; GISEL-NEXT: scratch_load_b32 v145, off, s33 offset:368
+; GISEL-NEXT: scratch_load_b32 v146, off, s33 offset:372
+; GISEL-NEXT: scratch_load_b32 v147, off, s33 offset:376
+; GISEL-NEXT: scratch_load_b32 v148, off, s33 offset:380
+; GISEL-NEXT: scratch_load_b32 v149, off, s33 offset:384
+; GISEL-NEXT: scratch_load_b32 v150, off, s33 offset:388
+; GISEL-NEXT: scratch_load_b32 v151, off, s33 offset:392
+; GISEL-NEXT: s_clause 0x1f
+; GISEL-NEXT: scratch_load_b32 v160, off, s33 offset:396
+; GISEL-NEXT: scratch_load_b32 v161, off, s33 offset:400
+; GISEL-NEXT: scratch_load_b32 v162, off, s33 offset:404
+; GISEL-NEXT: scratch_load_b32 v163, off, s33 offset:408
+; GISEL-NEXT: scratch_load_b32 v164, off, s33 offset:412
+; GISEL-NEXT: scratch_load_b32 v165, off, s33 offset:416
+; GISEL-NEXT: scratch_load_b32 v166, off, s33 offset:420
+; GISEL-NEXT: scratch_load_b32 v167, off, s33 offset:424
+; GISEL-NEXT: scratch_load_b32 v176, off, s33 offset:428
+; GISEL-NEXT: scratch_load_b32 v177, off, s33 offset:432
+; GISEL-NEXT: scratch_load_b32 v178, off, s33 offset:436
+; GISEL-NEXT: scratch_load_b32 v179, off, s33 offset:440
+; GISEL-NEXT: scratch_load_b32 v180, off, s33 offset:444
+; GISEL-NEXT: scratch_load_b32 v181, off, s33 offset:448
+; GISEL-NEXT: scratch_load_b32 v182, off, s33 offset:452
+; GISEL-NEXT: scratch_load_b32 v183, off, s33 offset:456
+; GISEL-NEXT: scratch_load_b32 v192, off, s33 offset:460
+; GISEL-NEXT: scratch_load_b32 v193, off, s33 offset:464
+; GISEL-NEXT: scratch_load_b32 v194, off, s33 offset:468
+; GISEL-NEXT: scratch_load_b32 v195, off, s33 offset:472
+; GISEL-NEXT: scratch_load_b32 v196, off, s33 offset:476
+; GISEL-NEXT: scratch_load_b32 v197, off, s33 offset:480
+; GISEL-NEXT: scratch_load_b32 v198, off, s33 offset:484
+; GISEL-NEXT: scratch_load_b32 v199, off, s33 offset:488
+; GISEL-NEXT: scratch_load_b32 v208, off, s33 offset:492
+; GISEL-NEXT: scratch_load_b32 v209, off, s33 offset:496
+; GISEL-NEXT: scratch_load_b32 v210, off, s33 offset:500
+; GISEL-NEXT: scratch_load_b32 v211, off, s33 offset:504
+; GISEL-NEXT: scratch_load_b32 v212, off, s33 offset:508
+; GISEL-NEXT: scratch_load_b32 v213, off, s33 offset:512
+; GISEL-NEXT: scratch_load_b32 v214, off, s33 offset:516
+; GISEL-NEXT: scratch_load_b32 v215, off, s33 offset:520
+; GISEL-NEXT: s_clause 0xf
+; GISEL-NEXT: scratch_load_b32 v224, off, s33 offset:524
+; GISEL-NEXT: scratch_load_b32 v225, off, s33 offset:528
+; GISEL-NEXT: scratch_load_b32 v226, off, s33 offset:532
+; GISEL-NEXT: scratch_load_b32 v227, off, s33 offset:536
+; GISEL-NEXT: scratch_load_b32 v228, off, s33 offset:540
+; GISEL-NEXT: scratch_load_b32 v229, off, s33 offset:544
+; GISEL-NEXT: scratch_load_b32 v230, off, s33 offset:548
+; GISEL-NEXT: scratch_load_b32 v231, off, s33 offset:552
+; GISEL-NEXT: scratch_load_b32 v240, off, s33 offset:556
+; GISEL-NEXT: scratch_load_b32 v241, off, s33 offset:560
+; GISEL-NEXT: scratch_load_b32 v242, off, s33 offset:564
+; GISEL-NEXT: scratch_load_b32 v243, off, s33 offset:568
+; GISEL-NEXT: scratch_load_b32 v244, off, s33 offset:572
+; GISEL-NEXT: scratch_load_b32 v245, off, s33 offset:576
+; GISEL-NEXT: scratch_load_b32 v246, off, s33 offset:580
+; GISEL-NEXT: scratch_load_b32 v247, off, s33 offset:584
+; GISEL-NEXT: s_mov_b32 exec_lo, s4
+; GISEL-NEXT: s_mov_b32 s33, s0
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; DAGISEL64-LABEL: call_from_whole_wave:
+; DAGISEL64: ; %bb.0:
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_expcnt 0x0
+; DAGISEL64-NEXT: s_wait_samplecnt 0x0
+; DAGISEL64-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL64-NEXT: s_wait_kmcnt 0x0
+; DAGISEL64-NEXT: s_mov_b32 s0, s33
+; DAGISEL64-NEXT: s_mov_b32 s33, s32
+; DAGISEL64-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_store_b32 off, v0, s33 offset:4
+; DAGISEL64-NEXT: scratch_store_b32 off, v1, s33 offset:8
+; DAGISEL64-NEXT: scratch_store_b32 off, v2, s33 offset:12
+; DAGISEL64-NEXT: scratch_store_b32 off, v3, s33 offset:16
+; DAGISEL64-NEXT: scratch_store_b32 off, v4, s33 offset:20
+; DAGISEL64-NEXT: scratch_store_b32 off, v5, s33 offset:24
+; DAGISEL64-NEXT: scratch_store_b32 off, v6, s33 offset:28
+; DAGISEL64-NEXT: scratch_store_b32 off, v7, s33 offset:32
+; DAGISEL64-NEXT: scratch_store_b32 off, v8, s33 offset:36
+; DAGISEL64-NEXT: scratch_store_b32 off, v9, s33 offset:40
+; DAGISEL64-NEXT: scratch_store_b32 off, v10, s33 offset:44
+; DAGISEL64-NEXT: scratch_store_b32 off, v11, s33 offset:48
+; DAGISEL64-NEXT: scratch_store_b32 off, v12, s33 offset:52
+; DAGISEL64-NEXT: scratch_store_b32 off, v13, s33 offset:56
+; DAGISEL64-NEXT: scratch_store_b32 off, v14, s33 offset:60
+; DAGISEL64-NEXT: scratch_store_b32 off, v15, s33 offset:64
+; DAGISEL64-NEXT: scratch_store_b32 off, v16, s33 offset:68
+; DAGISEL64-NEXT: scratch_store_b32 off, v17, s33 offset:72
+; DAGISEL64-NEXT: scratch_store_b32 off, v18, s33 offset:76
+; DAGISEL64-NEXT: scratch_store_b32 off, v19, s33 offset:80
+; DAGISEL64-NEXT: scratch_store_b32 off, v20, s33 offset:84
+; DAGISEL64-NEXT: scratch_store_b32 off, v21, s33 offset:88
+; DAGISEL64-NEXT: scratch_store_b32 off, v22, s33 offset:92
+; DAGISEL64-NEXT: scratch_store_b32 off, v23, s33 offset:96
+; DAGISEL64-NEXT: scratch_store_b32 off, v24, s33 offset:100
+; DAGISEL64-NEXT: scratch_store_b32 off, v25, s33 offset:104
+; DAGISEL64-NEXT: scratch_store_b32 off, v26, s33 offset:108
+; DAGISEL64-NEXT: scratch_store_b32 off, v27, s33 offset:112
+; DAGISEL64-NEXT: scratch_store_b32 off, v28, s33 offset:116
+; DAGISEL64-NEXT: scratch_store_b32 off, v29, s33 offset:120
+; DAGISEL64-NEXT: scratch_store_b32 off, v30, s33 offset:124
+; DAGISEL64-NEXT: scratch_store_b32 off, v31, s33 offset:128
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_store_b32 off, v32, s33 offset:132
+; DAGISEL64-NEXT: scratch_store_b32 off, v33, s33 offset:136
+; DAGISEL64-NEXT: scratch_store_b32 off, v34, s33 offset:140
+; DAGISEL64-NEXT: scratch_store_b32 off, v35, s33 offset:144
+; DAGISEL64-NEXT: scratch_store_b32 off, v36, s33 offset:148
+; DAGISEL64-NEXT: scratch_store_b32 off, v37, s33 offset:152
+; DAGISEL64-NEXT: scratch_store_b32 off, v38, s33 offset:156
+; DAGISEL64-NEXT: scratch_store_b32 off, v39, s33 offset:160
+; DAGISEL64-NEXT: scratch_store_b32 off, v48, s33 offset:172
+; DAGISEL64-NEXT: scratch_store_b32 off, v49, s33 offset:176
+; DAGISEL64-NEXT: scratch_store_b32 off, v50, s33 offset:180
+; DAGISEL64-NEXT: scratch_store_b32 off, v51, s33 offset:184
+; DAGISEL64-NEXT: scratch_store_b32 off, v52, s33 offset:188
+; DAGISEL64-NEXT: scratch_store_b32 off, v53, s33 offset:192
+; DAGISEL64-NEXT: scratch_store_b32 off, v54, s33 offset:196
+; DAGISEL64-NEXT: scratch_store_b32 off, v55, s33 offset:200
+; DAGISEL64-NEXT: scratch_store_b32 off, v64, s33 offset:204
+; DAGISEL64-NEXT: scratch_store_b32 off, v65, s33 offset:208
+; DAGISEL64-NEXT: scratch_store_b32 off, v66, s33 offset:212
+; DAGISEL64-NEXT: scratch_store_b32 off, v67, s33 offset:216
+; DAGISEL64-NEXT: scratch_store_b32 off, v68, s33 offset:220
+; DAGISEL64-NEXT: scratch_store_b32 off, v69, s33 offset:224
+; DAGISEL64-NEXT: scratch_store_b32 off, v70, s33 offset:228
+; DAGISEL64-NEXT: scratch_store_b32 off, v71, s33 offset:232
+; DAGISEL64-NEXT: scratch_store_b32 off, v80, s33 offset:236
+; DAGISEL64-NEXT: scratch_store_b32 off, v81, s33 offset:240
+; DAGISEL64-NEXT: scratch_store_b32 off, v82, s33 offset:244
+; DAGISEL64-NEXT: scratch_store_b32 off, v83, s33 offset:248
+; DAGISEL64-NEXT: scratch_store_b32 off, v84, s33 offset:252
+; DAGISEL64-NEXT: scratch_store_b32 off, v85, s33 offset:256
+; DAGISEL64-NEXT: scratch_store_b32 off, v86, s33 offset:260
+; DAGISEL64-NEXT: scratch_store_b32 off, v87, s33 offset:264
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_store_b32 off, v96, s33 offset:268
+; DAGISEL64-NEXT: scratch_store_b32 off, v97, s33 offset:272
+; DAGISEL64-NEXT: scratch_store_b32 off, v98, s33 offset:276
+; DAGISEL64-NEXT: scratch_store_b32 off, v99, s33 offset:280
+; DAGISEL64-NEXT: scratch_store_b32 off, v100, s33 offset:284
+; DAGISEL64-NEXT: scratch_store_b32 off, v101, s33 offset:288
+; DAGISEL64-NEXT: scratch_store_b32 off, v102, s33 offset:292
+; DAGISEL64-NEXT: scratch_store_b32 off, v103, s33 offset:296
+; DAGISEL64-NEXT: scratch_store_b32 off, v112, s33 offset:300
+; DAGISEL64-NEXT: scratch_store_b32 off, v113, s33 offset:304
+; DAGISEL64-NEXT: scratch_store_b32 off, v114, s33 offset:308
+; DAGISEL64-NEXT: scratch_store_b32 off, v115, s33 offset:312
+; DAGISEL64-NEXT: scratch_store_b32 off, v116, s33 offset:316
+; DAGISEL64-NEXT: scratch_store_b32 off, v117, s33 offset:320
+; DAGISEL64-NEXT: scratch_store_b32 off, v118, s33 offset:324
+; DAGISEL64-NEXT: scratch_store_b32 off, v119, s33 offset:328
+; DAGISEL64-NEXT: scratch_store_b32 off, v128, s33 offset:332
+; DAGISEL64-NEXT: scratch_store_b32 off, v129, s33 offset:336
+; DAGISEL64-NEXT: scratch_store_b32 off, v130, s33 offset:340
+; DAGISEL64-NEXT: scratch_store_b32 off, v131, s33 offset:344
+; DAGISEL64-NEXT: scratch_store_b32 off, v132, s33 offset:348
+; DAGISEL64-NEXT: scratch_store_b32 off, v133, s33 offset:352
+; DAGISEL64-NEXT: scratch_store_b32 off, v134, s33 offset:356
+; DAGISEL64-NEXT: scratch_store_b32 off, v135, s33 offset:360
+; DAGISEL64-NEXT: scratch_store_b32 off, v144, s33 offset:364
+; DAGISEL64-NEXT: scratch_store_b32 off, v145, s33 offset:368
+; DAGISEL64-NEXT: scratch_store_b32 off, v146, s33 offset:372
+; DAGISEL64-NEXT: scratch_store_b32 off, v147, s33 offset:376
+; DAGISEL64-NEXT: scratch_store_b32 off, v148, s33 offset:380
+; DAGISEL64-NEXT: scratch_store_b32 off, v149, s33 offset:384
+; DAGISEL64-NEXT: scratch_store_b32 off, v150, s33 offset:388
+; DAGISEL64-NEXT: scratch_store_b32 off, v151, s33 offset:392
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_store_b32 off, v160, s33 offset:396
+; DAGISEL64-NEXT: scratch_store_b32 off, v161, s33 offset:400
+; DAGISEL64-NEXT: scratch_store_b32 off, v162, s33 offset:404
+; DAGISEL64-NEXT: scratch_store_b32 off, v163, s33 offset:408
+; DAGISEL64-NEXT: scratch_store_b32 off, v164, s33 offset:412
+; DAGISEL64-NEXT: scratch_store_b32 off, v165, s33 offset:416
+; DAGISEL64-NEXT: scratch_store_b32 off, v166, s33 offset:420
+; DAGISEL64-NEXT: scratch_store_b32 off, v167, s33 offset:424
+; DAGISEL64-NEXT: scratch_store_b32 off, v176, s33 offset:428
+; DAGISEL64-NEXT: scratch_store_b32 off, v177, s33 offset:432
+; DAGISEL64-NEXT: scratch_store_b32 off, v178, s33 offset:436
+; DAGISEL64-NEXT: scratch_store_b32 off, v179, s33 offset:440
+; DAGISEL64-NEXT: scratch_store_b32 off, v180, s33 offset:444
+; DAGISEL64-NEXT: scratch_store_b32 off, v181, s33 offset:448
+; DAGISEL64-NEXT: scratch_store_b32 off, v182, s33 offset:452
+; DAGISEL64-NEXT: scratch_store_b32 off, v183, s33 offset:456
+; DAGISEL64-NEXT: scratch_store_b32 off, v192, s33 offset:460
+; DAGISEL64-NEXT: scratch_store_b32 off, v193, s33 offset:464
+; DAGISEL64-NEXT: scratch_store_b32 off, v194, s33 offset:468
+; DAGISEL64-NEXT: scratch_store_b32 off, v195, s33 offset:472
+; DAGISEL64-NEXT: scratch_store_b32 off, v196, s33 offset:476
+; DAGISEL64-NEXT: scratch_store_b32 off, v197, s33 offset:480
+; DAGISEL64-NEXT: scratch_store_b32 off, v198, s33 offset:484
+; DAGISEL64-NEXT: scratch_store_b32 off, v199, s33 offset:488
+; DAGISEL64-NEXT: scratch_store_b32 off, v208, s33 offset:492
+; DAGISEL64-NEXT: scratch_store_b32 off, v209, s33 offset:496
+; DAGISEL64-NEXT: scratch_store_b32 off, v210, s33 offset:500
+; DAGISEL64-NEXT: scratch_store_b32 off, v211, s33 offset:504
+; DAGISEL64-NEXT: scratch_store_b32 off, v212, s33 offset:508
+; DAGISEL64-NEXT: scratch_store_b32 off, v213, s33 offset:512
+; DAGISEL64-NEXT: scratch_store_b32 off, v214, s33 offset:516
+; DAGISEL64-NEXT: scratch_store_b32 off, v215, s33 offset:520
+; DAGISEL64-NEXT: s_clause 0xf
+; DAGISEL64-NEXT: scratch_store_b32 off, v224, s33 offset:524
+; DAGISEL64-NEXT: scratch_store_b32 off, v225, s33 offset:528
+; DAGISEL64-NEXT: scratch_store_b32 off, v226, s33 offset:532
+; DAGISEL64-NEXT: scratch_store_b32 off, v227, s33 offset:536
+; DAGISEL64-NEXT: scratch_store_b32 off, v228, s33 offset:540
+; DAGISEL64-NEXT: scratch_store_b32 off, v229, s33 offset:544
+; DAGISEL64-NEXT: scratch_store_b32 off, v230, s33 offset:548
+; DAGISEL64-NEXT: scratch_store_b32 off, v231, s33 offset:552
+; DAGISEL64-NEXT: scratch_store_b32 off, v240, s33 offset:556
+; DAGISEL64-NEXT: scratch_store_b32 off, v241, s33 offset:560
+; DAGISEL64-NEXT: scratch_store_b32 off, v242, s33 offset:564
+; DAGISEL64-NEXT: scratch_store_b32 off, v243, s33 offset:568
+; DAGISEL64-NEXT: scratch_store_b32 off, v244, s33 offset:572
+; DAGISEL64-NEXT: scratch_store_b32 off, v245, s33 offset:576
+; DAGISEL64-NEXT: scratch_store_b32 off, v246, s33 offset:580
+; DAGISEL64-NEXT: scratch_store_b32 off, v247, s33 offset:584
+; DAGISEL64-NEXT: s_mov_b64 exec, -1
+; DAGISEL64-NEXT: s_clause 0x2
+; DAGISEL64-NEXT: scratch_store_b32 off, v42, s33
+; DAGISEL64-NEXT: scratch_store_b32 off, v40, s33 offset:164
+; DAGISEL64-NEXT: scratch_store_b32 off, v41, s33 offset:168
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: v_writelane_b32 v42, s0, 4
+; DAGISEL64-NEXT: s_mov_b32 s1, callee@abs32@hi
+; DAGISEL64-NEXT: s_mov_b32 s0, callee@abs32@lo
+; DAGISEL64-NEXT: s_addk_co_i32 s32, 0x250
+; DAGISEL64-NEXT: v_mov_b32_e32 v41, v9
+; DAGISEL64-NEXT: v_writelane_b32 v42, s4, 0
+; DAGISEL64-NEXT: v_mov_b32_e32 v40, v8
+; DAGISEL64-NEXT: v_writelane_b32 v42, s5, 1
+; DAGISEL64-NEXT: v_writelane_b32 v42, s30, 2
+; DAGISEL64-NEXT: v_writelane_b32 v42, s31, 3
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL64-NEXT: flat_store_b32 v[40:41], v0
+; DAGISEL64-NEXT: v_readlane_b32 s31, v42, 3
+; DAGISEL64-NEXT: v_readlane_b32 s30, v42, 2
+; DAGISEL64-NEXT: v_readlane_b32 s5, v42, 1
+; DAGISEL64-NEXT: v_readlane_b32 s4, v42, 0
+; DAGISEL64-NEXT: v_readlane_b32 s0, v42, 4
+; DAGISEL64-NEXT: s_clause 0x2
+; DAGISEL64-NEXT: scratch_load_b32 v42, off, s33
+; DAGISEL64-NEXT: scratch_load_b32 v40, off, s33 offset:164
+; DAGISEL64-NEXT: scratch_load_b32 v41, off, s33 offset:168
+; DAGISEL64-NEXT: s_mov_b32 s32, s33
+; DAGISEL64-NEXT: s_xor_b64 exec, s[4:5], -1
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_load_b32 v0, off, s33 offset:4
+; DAGISEL64-NEXT: scratch_load_b32 v1, off, s33 offset:8
+; DAGISEL64-NEXT: scratch_load_b32 v2, off, s33 offset:12
+; DAGISEL64-NEXT: scratch_load_b32 v3, off, s33 offset:16
+; DAGISEL64-NEXT: scratch_load_b32 v4, off, s33 offset:20
+; DAGISEL64-NEXT: scratch_load_b32 v5, off, s33 offset:24
+; DAGISEL64-NEXT: scratch_load_b32 v6, off, s33 offset:28
+; DAGISEL64-NEXT: scratch_load_b32 v7, off, s33 offset:32
+; DAGISEL64-NEXT: scratch_load_b32 v8, off, s33 offset:36
+; DAGISEL64-NEXT: scratch_load_b32 v9, off, s33 offset:40
+; DAGISEL64-NEXT: scratch_load_b32 v10, off, s33 offset:44
+; DAGISEL64-NEXT: scratch_load_b32 v11, off, s33 offset:48
+; DAGISEL64-NEXT: scratch_load_b32 v12, off, s33 offset:52
+; DAGISEL64-NEXT: scratch_load_b32 v13, off, s33 offset:56
+; DAGISEL64-NEXT: scratch_load_b32 v14, off, s33 offset:60
+; DAGISEL64-NEXT: scratch_load_b32 v15, off, s33 offset:64
+; DAGISEL64-NEXT: scratch_load_b32 v16, off, s33 offset:68
+; DAGISEL64-NEXT: scratch_load_b32 v17, off, s33 offset:72
+; DAGISEL64-NEXT: scratch_load_b32 v18, off, s33 offset:76
+; DAGISEL64-NEXT: scratch_load_b32 v19, off, s33 offset:80
+; DAGISEL64-NEXT: scratch_load_b32 v20, off, s33 offset:84
+; DAGISEL64-NEXT: scratch_load_b32 v21, off, s33 offset:88
+; DAGISEL64-NEXT: scratch_load_b32 v22, off, s33 offset:92
+; DAGISEL64-NEXT: scratch_load_b32 v23, off, s33 offset:96
+; DAGISEL64-NEXT: scratch_load_b32 v24, off, s33 offset:100
+; DAGISEL64-NEXT: scratch_load_b32 v25, off, s33 offset:104
+; DAGISEL64-NEXT: scratch_load_b32 v26, off, s33 offset:108
+; DAGISEL64-NEXT: scratch_load_b32 v27, off, s33 offset:112
+; DAGISEL64-NEXT: scratch_load_b32 v28, off, s33 offset:116
+; DAGISEL64-NEXT: scratch_load_b32 v29, off, s33 offset:120
+; DAGISEL64-NEXT: scratch_load_b32 v30, off, s33 offset:124
+; DAGISEL64-NEXT: scratch_load_b32 v31, off, s33 offset:128
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_load_b32 v32, off, s33 offset:132
+; DAGISEL64-NEXT: scratch_load_b32 v33, off, s33 offset:136
+; DAGISEL64-NEXT: scratch_load_b32 v34, off, s33 offset:140
+; DAGISEL64-NEXT: scratch_load_b32 v35, off, s33 offset:144
+; DAGISEL64-NEXT: scratch_load_b32 v36, off, s33 offset:148
+; DAGISEL64-NEXT: scratch_load_b32 v37, off, s33 offset:152
+; DAGISEL64-NEXT: scratch_load_b32 v38, off, s33 offset:156
+; DAGISEL64-NEXT: scratch_load_b32 v39, off, s33 offset:160
+; DAGISEL64-NEXT: scratch_load_b32 v48, off, s33 offset:172
+; DAGISEL64-NEXT: scratch_load_b32 v49, off, s33 offset:176
+; DAGISEL64-NEXT: scratch_load_b32 v50, off, s33 offset:180
+; DAGISEL64-NEXT: scratch_load_b32 v51, off, s33 offset:184
+; DAGISEL64-NEXT: scratch_load_b32 v52, off, s33 offset:188
+; DAGISEL64-NEXT: scratch_load_b32 v53, off, s33 offset:192
+; DAGISEL64-NEXT: scratch_load_b32 v54, off, s33 offset:196
+; DAGISEL64-NEXT: scratch_load_b32 v55, off, s33 offset:200
+; DAGISEL64-NEXT: scratch_load_b32 v64, off, s33 offset:204
+; DAGISEL64-NEXT: scratch_load_b32 v65, off, s33 offset:208
+; DAGISEL64-NEXT: scratch_load_b32 v66, off, s33 offset:212
+; DAGISEL64-NEXT: scratch_load_b32 v67, off, s33 offset:216
+; DAGISEL64-NEXT: scratch_load_b32 v68, off, s33 offset:220
+; DAGISEL64-NEXT: scratch_load_b32 v69, off, s33 offset:224
+; DAGISEL64-NEXT: scratch_load_b32 v70, off, s33 offset:228
+; DAGISEL64-NEXT: scratch_load_b32 v71, off, s33 offset:232
+; DAGISEL64-NEXT: scratch_load_b32 v80, off, s33 offset:236
+; DAGISEL64-NEXT: scratch_load_b32 v81, off, s33 offset:240
+; DAGISEL64-NEXT: scratch_load_b32 v82, off, s33 offset:244
+; DAGISEL64-NEXT: scratch_load_b32 v83, off, s33 offset:248
+; DAGISEL64-NEXT: scratch_load_b32 v84, off, s33 offset:252
+; DAGISEL64-NEXT: scratch_load_b32 v85, off, s33 offset:256
+; DAGISEL64-NEXT: scratch_load_b32 v86, off, s33 offset:260
+; DAGISEL64-NEXT: scratch_load_b32 v87, off, s33 offset:264
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_load_b32 v96, off, s33 offset:268
+; DAGISEL64-NEXT: scratch_load_b32 v97, off, s33 offset:272
+; DAGISEL64-NEXT: scratch_load_b32 v98, off, s33 offset:276
+; DAGISEL64-NEXT: scratch_load_b32 v99, off, s33 offset:280
+; DAGISEL64-NEXT: scratch_load_b32 v100, off, s33 offset:284
+; DAGISEL64-NEXT: scratch_load_b32 v101, off, s33 offset:288
+; DAGISEL64-NEXT: scratch_load_b32 v102, off, s33 offset:292
+; DAGISEL64-NEXT: scratch_load_b32 v103, off, s33 offset:296
+; DAGISEL64-NEXT: scratch_load_b32 v112, off, s33 offset:300
+; DAGISEL64-NEXT: scratch_load_b32 v113, off, s33 offset:304
+; DAGISEL64-NEXT: scratch_load_b32 v114, off, s33 offset:308
+; DAGISEL64-NEXT: scratch_load_b32 v115, off, s33 offset:312
+; DAGISEL64-NEXT: scratch_load_b32 v116, off, s33 offset:316
+; DAGISEL64-NEXT: scratch_load_b32 v117, off, s33 offset:320
+; DAGISEL64-NEXT: scratch_load_b32 v118, off, s33 offset:324
+; DAGISEL64-NEXT: scratch_load_b32 v119, off, s33 offset:328
+; DAGISEL64-NEXT: scratch_load_b32 v128, off, s33 offset:332
+; DAGISEL64-NEXT: scratch_load_b32 v129, off, s33 offset:336
+; DAGISEL64-NEXT: scratch_load_b32 v130, off, s33 offset:340
+; DAGISEL64-NEXT: scratch_load_b32 v131, off, s33 offset:344
+; DAGISEL64-NEXT: scratch_load_b32 v132, off, s33 offset:348
+; DAGISEL64-NEXT: scratch_load_b32 v133, off, s33 offset:352
+; DAGISEL64-NEXT: scratch_load_b32 v134, off, s33 offset:356
+; DAGISEL64-NEXT: scratch_load_b32 v135, off, s33 offset:360
+; DAGISEL64-NEXT: scratch_load_b32 v144, off, s33 offset:364
+; DAGISEL64-NEXT: scratch_load_b32 v145, off, s33 offset:368
+; DAGISEL64-NEXT: scratch_load_b32 v146, off, s33 offset:372
+; DAGISEL64-NEXT: scratch_load_b32 v147, off, s33 offset:376
+; DAGISEL64-NEXT: scratch_load_b32 v148, off, s33 offset:380
+; DAGISEL64-NEXT: scratch_load_b32 v149, off, s33 offset:384
+; DAGISEL64-NEXT: scratch_load_b32 v150, off, s33 offset:388
+; DAGISEL64-NEXT: scratch_load_b32 v151, off, s33 offset:392
+; DAGISEL64-NEXT: s_clause 0x1f
+; DAGISEL64-NEXT: scratch_load_b32 v160, off, s33 offset:396
+; DAGISEL64-NEXT: scratch_load_b32 v161, off, s33 offset:400
+; DAGISEL64-NEXT: scratch_load_b32 v162, off, s33 offset:404
+; DAGISEL64-NEXT: scratch_load_b32 v163, off, s33 offset:408
+; DAGISEL64-NEXT: scratch_load_b32 v164, off, s33 offset:412
+; DAGISEL64-NEXT: scratch_load_b32 v165, off, s33 offset:416
+; DAGISEL64-NEXT: scratch_load_b32 v166, off, s33 offset:420
+; DAGISEL64-NEXT: scratch_load_b32 v167, off, s33 offset:424
+; DAGISEL64-NEXT: scratch_load_b32 v176, off, s33 offset:428
+; DAGISEL64-NEXT: scratch_load_b32 v177, off, s33 offset:432
+; DAGISEL64-NEXT: scratch_load_b32 v178, off, s33 offset:436
+; DAGISEL64-NEXT: scratch_load_b32 v179, off, s33 offset:440
+; DAGISEL64-NEXT: scratch_load_b32 v180, off, s33 offset:444
+; DAGISEL64-NEXT: scratch_load_b32 v181, off, s33 offset:448
+; DAGISEL64-NEXT: scratch_load_b32 v182, off, s33 offset:452
+; DAGISEL64-NEXT: scratch_load_b32 v183, off, s33 offset:456
+; DAGISEL64-NEXT: scratch_load_b32 v192, off, s33 offset:460
+; DAGISEL64-NEXT: scratch_load_b32 v193, off, s33 offset:464
+; DAGISEL64-NEXT: scratch_load_b32 v194, off, s33 offset:468
+; DAGISEL64-NEXT: scratch_load_b32 v195, off, s33 offset:472
+; DAGISEL64-NEXT: scratch_load_b32 v196, off, s33 offset:476
+; DAGISEL64-NEXT: scratch_load_b32 v197, off, s33 offset:480
+; DAGISEL64-NEXT: scratch_load_b32 v198, off, s33 offset:484
+; DAGISEL64-NEXT: scratch_load_b32 v199, off, s33 offset:488
+; DAGISEL64-NEXT: scratch_load_b32 v208, off, s33 offset:492
+; DAGISEL64-NEXT: scratch_load_b32 v209, off, s33 offset:496
+; DAGISEL64-NEXT: scratch_load_b32 v210, off, s33 offset:500
+; DAGISEL64-NEXT: scratch_load_b32 v211, off, s33 offset:504
+; DAGISEL64-NEXT: scratch_load_b32 v212, off, s33 offset:508
+; DAGISEL64-NEXT: scratch_load_b32 v213, off, s33 offset:512
+; DAGISEL64-NEXT: scratch_load_b32 v214, off, s33 offset:516
+; DAGISEL64-NEXT: scratch_load_b32 v215, off, s33 offset:520
+; DAGISEL64-NEXT: s_clause 0xf
+; DAGISEL64-NEXT: scratch_load_b32 v224, off, s33 offset:524
+; DAGISEL64-NEXT: scratch_load_b32 v225, off, s33 offset:528
+; DAGISEL64-NEXT: scratch_load_b32 v226, off, s33 offset:532
+; DAGISEL64-NEXT: scratch_load_b32 v227, off, s33 offset:536
+; DAGISEL64-NEXT: scratch_load_b32 v228, off, s33 offset:540
+; DAGISEL64-NEXT: scratch_load_b32 v229, off, s33 offset:544
+; DAGISEL64-NEXT: scratch_load_b32 v230, off, s33 offset:548
+; DAGISEL64-NEXT: scratch_load_b32 v231, off, s33 offset:552
+; DAGISEL64-NEXT: scratch_load_b32 v240, off, s33 offset:556
+; DAGISEL64-NEXT: scratch_load_b32 v241, off, s33 offset:560
+; DAGISEL64-NEXT: scratch_load_b32 v242, off, s33 offset:564
+; DAGISEL64-NEXT: scratch_load_b32 v243, off, s33 offset:568
+; DAGISEL64-NEXT: scratch_load_b32 v244, off, s33 offset:572
+; DAGISEL64-NEXT: scratch_load_b32 v245, off, s33 offset:576
+; DAGISEL64-NEXT: scratch_load_b32 v246, off, s33 offset:580
+; DAGISEL64-NEXT: scratch_load_b32 v247, off, s33 offset:584
+; DAGISEL64-NEXT: s_mov_b64 exec, s[4:5]
+; DAGISEL64-NEXT: s_mov_b32 s33, s0
+; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL64-NEXT: s_wait_alu 0xfffe
+; DAGISEL64-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL64-LABEL: call_from_whole_wave:
+; GISEL64: ; %bb.0:
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_expcnt 0x0
+; GISEL64-NEXT: s_wait_samplecnt 0x0
+; GISEL64-NEXT: s_wait_bvhcnt 0x0
+; GISEL64-NEXT: s_wait_kmcnt 0x0
+; GISEL64-NEXT: s_mov_b32 s0, s33
+; GISEL64-NEXT: s_mov_b32 s33, s32
+; GISEL64-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_store_b32 off, v0, s33 offset:4
+; GISEL64-NEXT: scratch_store_b32 off, v1, s33 offset:8
+; GISEL64-NEXT: scratch_store_b32 off, v2, s33 offset:12
+; GISEL64-NEXT: scratch_store_b32 off, v3, s33 offset:16
+; GISEL64-NEXT: scratch_store_b32 off, v4, s33 offset:20
+; GISEL64-NEXT: scratch_store_b32 off, v5, s33 offset:24
+; GISEL64-NEXT: scratch_store_b32 off, v6, s33 offset:28
+; GISEL64-NEXT: scratch_store_b32 off, v7, s33 offset:32
+; GISEL64-NEXT: scratch_store_b32 off, v8, s33 offset:36
+; GISEL64-NEXT: scratch_store_b32 off, v9, s33 offset:40
+; GISEL64-NEXT: scratch_store_b32 off, v10, s33 offset:44
+; GISEL64-NEXT: scratch_store_b32 off, v11, s33 offset:48
+; GISEL64-NEXT: scratch_store_b32 off, v12, s33 offset:52
+; GISEL64-NEXT: scratch_store_b32 off, v13, s33 offset:56
+; GISEL64-NEXT: scratch_store_b32 off, v14, s33 offset:60
+; GISEL64-NEXT: scratch_store_b32 off, v15, s33 offset:64
+; GISEL64-NEXT: scratch_store_b32 off, v16, s33 offset:68
+; GISEL64-NEXT: scratch_store_b32 off, v17, s33 offset:72
+; GISEL64-NEXT: scratch_store_b32 off, v18, s33 offset:76
+; GISEL64-NEXT: scratch_store_b32 off, v19, s33 offset:80
+; GISEL64-NEXT: scratch_store_b32 off, v20, s33 offset:84
+; GISEL64-NEXT: scratch_store_b32 off, v21, s33 offset:88
+; GISEL64-NEXT: scratch_store_b32 off, v22, s33 offset:92
+; GISEL64-NEXT: scratch_store_b32 off, v23, s33 offset:96
+; GISEL64-NEXT: scratch_store_b32 off, v24, s33 offset:100
+; GISEL64-NEXT: scratch_store_b32 off, v25, s33 offset:104
+; GISEL64-NEXT: scratch_store_b32 off, v26, s33 offset:108
+; GISEL64-NEXT: scratch_store_b32 off, v27, s33 offset:112
+; GISEL64-NEXT: scratch_store_b32 off, v28, s33 offset:116
+; GISEL64-NEXT: scratch_store_b32 off, v29, s33 offset:120
+; GISEL64-NEXT: scratch_store_b32 off, v30, s33 offset:124
+; GISEL64-NEXT: scratch_store_b32 off, v31, s33 offset:128
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_store_b32 off, v32, s33 offset:132
+; GISEL64-NEXT: scratch_store_b32 off, v33, s33 offset:136
+; GISEL64-NEXT: scratch_store_b32 off, v34, s33 offset:140
+; GISEL64-NEXT: scratch_store_b32 off, v35, s33 offset:144
+; GISEL64-NEXT: scratch_store_b32 off, v36, s33 offset:148
+; GISEL64-NEXT: scratch_store_b32 off, v37, s33 offset:152
+; GISEL64-NEXT: scratch_store_b32 off, v38, s33 offset:156
+; GISEL64-NEXT: scratch_store_b32 off, v39, s33 offset:160
+; GISEL64-NEXT: scratch_store_b32 off, v48, s33 offset:172
+; GISEL64-NEXT: scratch_store_b32 off, v49, s33 offset:176
+; GISEL64-NEXT: scratch_store_b32 off, v50, s33 offset:180
+; GISEL64-NEXT: scratch_store_b32 off, v51, s33 offset:184
+; GISEL64-NEXT: scratch_store_b32 off, v52, s33 offset:188
+; GISEL64-NEXT: scratch_store_b32 off, v53, s33 offset:192
+; GISEL64-NEXT: scratch_store_b32 off, v54, s33 offset:196
+; GISEL64-NEXT: scratch_store_b32 off, v55, s33 offset:200
+; GISEL64-NEXT: scratch_store_b32 off, v64, s33 offset:204
+; GISEL64-NEXT: scratch_store_b32 off, v65, s33 offset:208
+; GISEL64-NEXT: scratch_store_b32 off, v66, s33 offset:212
+; GISEL64-NEXT: scratch_store_b32 off, v67, s33 offset:216
+; GISEL64-NEXT: scratch_store_b32 off, v68, s33 offset:220
+; GISEL64-NEXT: scratch_store_b32 off, v69, s33 offset:224
+; GISEL64-NEXT: scratch_store_b32 off, v70, s33 offset:228
+; GISEL64-NEXT: scratch_store_b32 off, v71, s33 offset:232
+; GISEL64-NEXT: scratch_store_b32 off, v80, s33 offset:236
+; GISEL64-NEXT: scratch_store_b32 off, v81, s33 offset:240
+; GISEL64-NEXT: scratch_store_b32 off, v82, s33 offset:244
+; GISEL64-NEXT: scratch_store_b32 off, v83, s33 offset:248
+; GISEL64-NEXT: scratch_store_b32 off, v84, s33 offset:252
+; GISEL64-NEXT: scratch_store_b32 off, v85, s33 offset:256
+; GISEL64-NEXT: scratch_store_b32 off, v86, s33 offset:260
+; GISEL64-NEXT: scratch_store_b32 off, v87, s33 offset:264
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_store_b32 off, v96, s33 offset:268
+; GISEL64-NEXT: scratch_store_b32 off, v97, s33 offset:272
+; GISEL64-NEXT: scratch_store_b32 off, v98, s33 offset:276
+; GISEL64-NEXT: scratch_store_b32 off, v99, s33 offset:280
+; GISEL64-NEXT: scratch_store_b32 off, v100, s33 offset:284
+; GISEL64-NEXT: scratch_store_b32 off, v101, s33 offset:288
+; GISEL64-NEXT: scratch_store_b32 off, v102, s33 offset:292
+; GISEL64-NEXT: scratch_store_b32 off, v103, s33 offset:296
+; GISEL64-NEXT: scratch_store_b32 off, v112, s33 offset:300
+; GISEL64-NEXT: scratch_store_b32 off, v113, s33 offset:304
+; GISEL64-NEXT: scratch_store_b32 off, v114, s33 offset:308
+; GISEL64-NEXT: scratch_store_b32 off, v115, s33 offset:312
+; GISEL64-NEXT: scratch_store_b32 off, v116, s33 offset:316
+; GISEL64-NEXT: scratch_store_b32 off, v117, s33 offset:320
+; GISEL64-NEXT: scratch_store_b32 off, v118, s33 offset:324
+; GISEL64-NEXT: scratch_store_b32 off, v119, s33 offset:328
+; GISEL64-NEXT: scratch_store_b32 off, v128, s33 offset:332
+; GISEL64-NEXT: scratch_store_b32 off, v129, s33 offset:336
+; GISEL64-NEXT: scratch_store_b32 off, v130, s33 offset:340
+; GISEL64-NEXT: scratch_store_b32 off, v131, s33 offset:344
+; GISEL64-NEXT: scratch_store_b32 off, v132, s33 offset:348
+; GISEL64-NEXT: scratch_store_b32 off, v133, s33 offset:352
+; GISEL64-NEXT: scratch_store_b32 off, v134, s33 offset:356
+; GISEL64-NEXT: scratch_store_b32 off, v135, s33 offset:360
+; GISEL64-NEXT: scratch_store_b32 off, v144, s33 offset:364
+; GISEL64-NEXT: scratch_store_b32 off, v145, s33 offset:368
+; GISEL64-NEXT: scratch_store_b32 off, v146, s33 offset:372
+; GISEL64-NEXT: scratch_store_b32 off, v147, s33 offset:376
+; GISEL64-NEXT: scratch_store_b32 off, v148, s33 offset:380
+; GISEL64-NEXT: scratch_store_b32 off, v149, s33 offset:384
+; GISEL64-NEXT: scratch_store_b32 off, v150, s33 offset:388
+; GISEL64-NEXT: scratch_store_b32 off, v151, s33 offset:392
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_store_b32 off, v160, s33 offset:396
+; GISEL64-NEXT: scratch_store_b32 off, v161, s33 offset:400
+; GISEL64-NEXT: scratch_store_b32 off, v162, s33 offset:404
+; GISEL64-NEXT: scratch_store_b32 off, v163, s33 offset:408
+; GISEL64-NEXT: scratch_store_b32 off, v164, s33 offset:412
+; GISEL64-NEXT: scratch_store_b32 off, v165, s33 offset:416
+; GISEL64-NEXT: scratch_store_b32 off, v166, s33 offset:420
+; GISEL64-NEXT: scratch_store_b32 off, v167, s33 offset:424
+; GISEL64-NEXT: scratch_store_b32 off, v176, s33 offset:428
+; GISEL64-NEXT: scratch_store_b32 off, v177, s33 offset:432
+; GISEL64-NEXT: scratch_store_b32 off, v178, s33 offset:436
+; GISEL64-NEXT: scratch_store_b32 off, v179, s33 offset:440
+; GISEL64-NEXT: scratch_store_b32 off, v180, s33 offset:444
+; GISEL64-NEXT: scratch_store_b32 off, v181, s33 offset:448
+; GISEL64-NEXT: scratch_store_b32 off, v182, s33 offset:452
+; GISEL64-NEXT: scratch_store_b32 off, v183, s33 offset:456
+; GISEL64-NEXT: scratch_store_b32 off, v192, s33 offset:460
+; GISEL64-NEXT: scratch_store_b32 off, v193, s33 offset:464
+; GISEL64-NEXT: scratch_store_b32 off, v194, s33 offset:468
+; GISEL64-NEXT: scratch_store_b32 off, v195, s33 offset:472
+; GISEL64-NEXT: scratch_store_b32 off, v196, s33 offset:476
+; GISEL64-NEXT: scratch_store_b32 off, v197, s33 offset:480
+; GISEL64-NEXT: scratch_store_b32 off, v198, s33 offset:484
+; GISEL64-NEXT: scratch_store_b32 off, v199, s33 offset:488
+; GISEL64-NEXT: scratch_store_b32 off, v208, s33 offset:492
+; GISEL64-NEXT: scratch_store_b32 off, v209, s33 offset:496
+; GISEL64-NEXT: scratch_store_b32 off, v210, s33 offset:500
+; GISEL64-NEXT: scratch_store_b32 off, v211, s33 offset:504
+; GISEL64-NEXT: scratch_store_b32 off, v212, s33 offset:508
+; GISEL64-NEXT: scratch_store_b32 off, v213, s33 offset:512
+; GISEL64-NEXT: scratch_store_b32 off, v214, s33 offset:516
+; GISEL64-NEXT: scratch_store_b32 off, v215, s33 offset:520
+; GISEL64-NEXT: s_clause 0xf
+; GISEL64-NEXT: scratch_store_b32 off, v224, s33 offset:524
+; GISEL64-NEXT: scratch_store_b32 off, v225, s33 offset:528
+; GISEL64-NEXT: scratch_store_b32 off, v226, s33 offset:532
+; GISEL64-NEXT: scratch_store_b32 off, v227, s33 offset:536
+; GISEL64-NEXT: scratch_store_b32 off, v228, s33 offset:540
+; GISEL64-NEXT: scratch_store_b32 off, v229, s33 offset:544
+; GISEL64-NEXT: scratch_store_b32 off, v230, s33 offset:548
+; GISEL64-NEXT: scratch_store_b32 off, v231, s33 offset:552
+; GISEL64-NEXT: scratch_store_b32 off, v240, s33 offset:556
+; GISEL64-NEXT: scratch_store_b32 off, v241, s33 offset:560
+; GISEL64-NEXT: scratch_store_b32 off, v242, s33 offset:564
+; GISEL64-NEXT: scratch_store_b32 off, v243, s33 offset:568
+; GISEL64-NEXT: scratch_store_b32 off, v244, s33 offset:572
+; GISEL64-NEXT: scratch_store_b32 off, v245, s33 offset:576
+; GISEL64-NEXT: scratch_store_b32 off, v246, s33 offset:580
+; GISEL64-NEXT: scratch_store_b32 off, v247, s33 offset:584
+; GISEL64-NEXT: s_mov_b64 exec, -1
+; GISEL64-NEXT: s_clause 0x2
+; GISEL64-NEXT: scratch_store_b32 off, v42, s33
+; GISEL64-NEXT: scratch_store_b32 off, v40, s33 offset:164
+; GISEL64-NEXT: scratch_store_b32 off, v41, s33 offset:168
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: v_writelane_b32 v42, s0, 4
+; GISEL64-NEXT: s_mov_b32 s0, callee@abs32@lo
+; GISEL64-NEXT: s_mov_b32 s1, callee@abs32@hi
+; GISEL64-NEXT: s_addk_co_i32 s32, 0x250
+; GISEL64-NEXT: v_mov_b32_e32 v40, v8
+; GISEL64-NEXT: v_writelane_b32 v42, s4, 0
+; GISEL64-NEXT: v_mov_b32_e32 v41, v9
+; GISEL64-NEXT: v_writelane_b32 v42, s5, 1
+; GISEL64-NEXT: v_writelane_b32 v42, s30, 2
+; GISEL64-NEXT: v_writelane_b32 v42, s31, 3
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL64-NEXT: flat_store_b32 v[40:41], v0
+; GISEL64-NEXT: v_readlane_b32 s31, v42, 3
+; GISEL64-NEXT: v_readlane_b32 s30, v42, 2
+; GISEL64-NEXT: v_readlane_b32 s5, v42, 1
+; GISEL64-NEXT: v_readlane_b32 s4, v42, 0
+; GISEL64-NEXT: v_readlane_b32 s0, v42, 4
+; GISEL64-NEXT: s_clause 0x2
+; GISEL64-NEXT: scratch_load_b32 v42, off, s33
+; GISEL64-NEXT: scratch_load_b32 v40, off, s33 offset:164
+; GISEL64-NEXT: scratch_load_b32 v41, off, s33 offset:168
+; GISEL64-NEXT: s_mov_b32 s32, s33
+; GISEL64-NEXT: s_xor_b64 exec, s[4:5], -1
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_load_b32 v0, off, s33 offset:4
+; GISEL64-NEXT: scratch_load_b32 v1, off, s33 offset:8
+; GISEL64-NEXT: scratch_load_b32 v2, off, s33 offset:12
+; GISEL64-NEXT: scratch_load_b32 v3, off, s33 offset:16
+; GISEL64-NEXT: scratch_load_b32 v4, off, s33 offset:20
+; GISEL64-NEXT: scratch_load_b32 v5, off, s33 offset:24
+; GISEL64-NEXT: scratch_load_b32 v6, off, s33 offset:28
+; GISEL64-NEXT: scratch_load_b32 v7, off, s33 offset:32
+; GISEL64-NEXT: scratch_load_b32 v8, off, s33 offset:36
+; GISEL64-NEXT: scratch_load_b32 v9, off, s33 offset:40
+; GISEL64-NEXT: scratch_load_b32 v10, off, s33 offset:44
+; GISEL64-NEXT: scratch_load_b32 v11, off, s33 offset:48
+; GISEL64-NEXT: scratch_load_b32 v12, off, s33 offset:52
+; GISEL64-NEXT: scratch_load_b32 v13, off, s33 offset:56
+; GISEL64-NEXT: scratch_load_b32 v14, off, s33 offset:60
+; GISEL64-NEXT: scratch_load_b32 v15, off, s33 offset:64
+; GISEL64-NEXT: scratch_load_b32 v16, off, s33 offset:68
+; GISEL64-NEXT: scratch_load_b32 v17, off, s33 offset:72
+; GISEL64-NEXT: scratch_load_b32 v18, off, s33 offset:76
+; GISEL64-NEXT: scratch_load_b32 v19, off, s33 offset:80
+; GISEL64-NEXT: scratch_load_b32 v20, off, s33 offset:84
+; GISEL64-NEXT: scratch_load_b32 v21, off, s33 offset:88
+; GISEL64-NEXT: scratch_load_b32 v22, off, s33 offset:92
+; GISEL64-NEXT: scratch_load_b32 v23, off, s33 offset:96
+; GISEL64-NEXT: scratch_load_b32 v24, off, s33 offset:100
+; GISEL64-NEXT: scratch_load_b32 v25, off, s33 offset:104
+; GISEL64-NEXT: scratch_load_b32 v26, off, s33 offset:108
+; GISEL64-NEXT: scratch_load_b32 v27, off, s33 offset:112
+; GISEL64-NEXT: scratch_load_b32 v28, off, s33 offset:116
+; GISEL64-NEXT: scratch_load_b32 v29, off, s33 offset:120
+; GISEL64-NEXT: scratch_load_b32 v30, off, s33 offset:124
+; GISEL64-NEXT: scratch_load_b32 v31, off, s33 offset:128
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_load_b32 v32, off, s33 offset:132
+; GISEL64-NEXT: scratch_load_b32 v33, off, s33 offset:136
+; GISEL64-NEXT: scratch_load_b32 v34, off, s33 offset:140
+; GISEL64-NEXT: scratch_load_b32 v35, off, s33 offset:144
+; GISEL64-NEXT: scratch_load_b32 v36, off, s33 offset:148
+; GISEL64-NEXT: scratch_load_b32 v37, off, s33 offset:152
+; GISEL64-NEXT: scratch_load_b32 v38, off, s33 offset:156
+; GISEL64-NEXT: scratch_load_b32 v39, off, s33 offset:160
+; GISEL64-NEXT: scratch_load_b32 v48, off, s33 offset:172
+; GISEL64-NEXT: scratch_load_b32 v49, off, s33 offset:176
+; GISEL64-NEXT: scratch_load_b32 v50, off, s33 offset:180
+; GISEL64-NEXT: scratch_load_b32 v51, off, s33 offset:184
+; GISEL64-NEXT: scratch_load_b32 v52, off, s33 offset:188
+; GISEL64-NEXT: scratch_load_b32 v53, off, s33 offset:192
+; GISEL64-NEXT: scratch_load_b32 v54, off, s33 offset:196
+; GISEL64-NEXT: scratch_load_b32 v55, off, s33 offset:200
+; GISEL64-NEXT: scratch_load_b32 v64, off, s33 offset:204
+; GISEL64-NEXT: scratch_load_b32 v65, off, s33 offset:208
+; GISEL64-NEXT: scratch_load_b32 v66, off, s33 offset:212
+; GISEL64-NEXT: scratch_load_b32 v67, off, s33 offset:216
+; GISEL64-NEXT: scratch_load_b32 v68, off, s33 offset:220
+; GISEL64-NEXT: scratch_load_b32 v69, off, s33 offset:224
+; GISEL64-NEXT: scratch_load_b32 v70, off, s33 offset:228
+; GISEL64-NEXT: scratch_load_b32 v71, off, s33 offset:232
+; GISEL64-NEXT: scratch_load_b32 v80, off, s33 offset:236
+; GISEL64-NEXT: scratch_load_b32 v81, off, s33 offset:240
+; GISEL64-NEXT: scratch_load_b32 v82, off, s33 offset:244
+; GISEL64-NEXT: scratch_load_b32 v83, off, s33 offset:248
+; GISEL64-NEXT: scratch_load_b32 v84, off, s33 offset:252
+; GISEL64-NEXT: scratch_load_b32 v85, off, s33 offset:256
+; GISEL64-NEXT: scratch_load_b32 v86, off, s33 offset:260
+; GISEL64-NEXT: scratch_load_b32 v87, off, s33 offset:264
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_load_b32 v96, off, s33 offset:268
+; GISEL64-NEXT: scratch_load_b32 v97, off, s33 offset:272
+; GISEL64-NEXT: scratch_load_b32 v98, off, s33 offset:276
+; GISEL64-NEXT: scratch_load_b32 v99, off, s33 offset:280
+; GISEL64-NEXT: scratch_load_b32 v100, off, s33 offset:284
+; GISEL64-NEXT: scratch_load_b32 v101, off, s33 offset:288
+; GISEL64-NEXT: scratch_load_b32 v102, off, s33 offset:292
+; GISEL64-NEXT: scratch_load_b32 v103, off, s33 offset:296
+; GISEL64-NEXT: scratch_load_b32 v112, off, s33 offset:300
+; GISEL64-NEXT: scratch_load_b32 v113, off, s33 offset:304
+; GISEL64-NEXT: scratch_load_b32 v114, off, s33 offset:308
+; GISEL64-NEXT: scratch_load_b32 v115, off, s33 offset:312
+; GISEL64-NEXT: scratch_load_b32 v116, off, s33 offset:316
+; GISEL64-NEXT: scratch_load_b32 v117, off, s33 offset:320
+; GISEL64-NEXT: scratch_load_b32 v118, off, s33 offset:324
+; GISEL64-NEXT: scratch_load_b32 v119, off, s33 offset:328
+; GISEL64-NEXT: scratch_load_b32 v128, off, s33 offset:332
+; GISEL64-NEXT: scratch_load_b32 v129, off, s33 offset:336
+; GISEL64-NEXT: scratch_load_b32 v130, off, s33 offset:340
+; GISEL64-NEXT: scratch_load_b32 v131, off, s33 offset:344
+; GISEL64-NEXT: scratch_load_b32 v132, off, s33 offset:348
+; GISEL64-NEXT: scratch_load_b32 v133, off, s33 offset:352
+; GISEL64-NEXT: scratch_load_b32 v134, off, s33 offset:356
+; GISEL64-NEXT: scratch_load_b32 v135, off, s33 offset:360
+; GISEL64-NEXT: scratch_load_b32 v144, off, s33 offset:364
+; GISEL64-NEXT: scratch_load_b32 v145, off, s33 offset:368
+; GISEL64-NEXT: scratch_load_b32 v146, off, s33 offset:372
+; GISEL64-NEXT: scratch_load_b32 v147, off, s33 offset:376
+; GISEL64-NEXT: scratch_load_b32 v148, off, s33 offset:380
+; GISEL64-NEXT: scratch_load_b32 v149, off, s33 offset:384
+; GISEL64-NEXT: scratch_load_b32 v150, off, s33 offset:388
+; GISEL64-NEXT: scratch_load_b32 v151, off, s33 offset:392
+; GISEL64-NEXT: s_clause 0x1f
+; GISEL64-NEXT: scratch_load_b32 v160, off, s33 offset:396
+; GISEL64-NEXT: scratch_load_b32 v161, off, s33 offset:400
+; GISEL64-NEXT: scratch_load_b32 v162, off, s33 offset:404
+; GISEL64-NEXT: scratch_load_b32 v163, off, s33 offset:408
+; GISEL64-NEXT: scratch_load_b32 v164, off, s33 offset:412
+; GISEL64-NEXT: scratch_load_b32 v165, off, s33 offset:416
+; GISEL64-NEXT: scratch_load_b32 v166, off, s33 offset:420
+; GISEL64-NEXT: scratch_load_b32 v167, off, s33 offset:424
+; GISEL64-NEXT: scratch_load_b32 v176, off, s33 offset:428
+; GISEL64-NEXT: scratch_load_b32 v177, off, s33 offset:432
+; GISEL64-NEXT: scratch_load_b32 v178, off, s33 offset:436
+; GISEL64-NEXT: scratch_load_b32 v179, off, s33 offset:440
+; GISEL64-NEXT: scratch_load_b32 v180, off, s33 offset:444
+; GISEL64-NEXT: scratch_load_b32 v181, off, s33 offset:448
+; GISEL64-NEXT: scratch_load_b32 v182, off, s33 offset:452
+; GISEL64-NEXT: scratch_load_b32 v183, off, s33 offset:456
+; GISEL64-NEXT: scratch_load_b32 v192, off, s33 offset:460
+; GISEL64-NEXT: scratch_load_b32 v193, off, s33 offset:464
+; GISEL64-NEXT: scratch_load_b32 v194, off, s33 offset:468
+; GISEL64-NEXT: scratch_load_b32 v195, off, s33 offset:472
+; GISEL64-NEXT: scratch_load_b32 v196, off, s33 offset:476
+; GISEL64-NEXT: scratch_load_b32 v197, off, s33 offset:480
+; GISEL64-NEXT: scratch_load_b32 v198, off, s33 offset:484
+; GISEL64-NEXT: scratch_load_b32 v199, off, s33 offset:488
+; GISEL64-NEXT: scratch_load_b32 v208, off, s33 offset:492
+; GISEL64-NEXT: scratch_load_b32 v209, off, s33 offset:496
+; GISEL64-NEXT: scratch_load_b32 v210, off, s33 offset:500
+; GISEL64-NEXT: scratch_load_b32 v211, off, s33 offset:504
+; GISEL64-NEXT: scratch_load_b32 v212, off, s33 offset:508
+; GISEL64-NEXT: scratch_load_b32 v213, off, s33 offset:512
+; GISEL64-NEXT: scratch_load_b32 v214, off, s33 offset:516
+; GISEL64-NEXT: scratch_load_b32 v215, off, s33 offset:520
+; GISEL64-NEXT: s_clause 0xf
+; GISEL64-NEXT: scratch_load_b32 v224, off, s33 offset:524
+; GISEL64-NEXT: scratch_load_b32 v225, off, s33 offset:528
+; GISEL64-NEXT: scratch_load_b32 v226, off, s33 offset:532
+; GISEL64-NEXT: scratch_load_b32 v227, off, s33 offset:536
+; GISEL64-NEXT: scratch_load_b32 v228, off, s33 offset:540
+; GISEL64-NEXT: scratch_load_b32 v229, off, s33 offset:544
+; GISEL64-NEXT: scratch_load_b32 v230, off, s33 offset:548
+; GISEL64-NEXT: scratch_load_b32 v231, off, s33 offset:552
+; GISEL64-NEXT: scratch_load_b32 v240, off, s33 offset:556
+; GISEL64-NEXT: scratch_load_b32 v241, off, s33 offset:560
+; GISEL64-NEXT: scratch_load_b32 v242, off, s33 offset:564
+; GISEL64-NEXT: scratch_load_b32 v243, off, s33 offset:568
+; GISEL64-NEXT: scratch_load_b32 v244, off, s33 offset:572
+; GISEL64-NEXT: scratch_load_b32 v245, off, s33 offset:576
+; GISEL64-NEXT: scratch_load_b32 v246, off, s33 offset:580
+; GISEL64-NEXT: scratch_load_b32 v247, off, s33 offset:584
+; GISEL64-NEXT: s_mov_b64 exec, s[4:5]
+; GISEL64-NEXT: s_mov_b32 s33, s0
+; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL64-NEXT: s_wait_alu 0xfffe
+; GISEL64-NEXT: s_setpc_b64 s[30:31]
+ %ret = call float(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @callee, <8 x float> %x) convergent
+ store float %ret, ptr %p
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir
index 350b233..ceb1b3e 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass si-wqm -o - %s | FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=fiji -passes=si-wqm -o - %s | FileCheck %s
@@ -46,10 +47,6 @@
---
# Check for awareness that s_or_saveexec_b64 clobbers SCC
-#
-#CHECK: ENTER_STRICT_WWM
-#CHECK: S_CMP_LT_I32
-#CHECK: S_CSELECT_B32
name: test_strict_wwm_scc
alignment: 1
exposesReturnsTwice: false
@@ -80,6 +77,21 @@ body: |
bb.0:
liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0
+ ; CHECK-LABEL: name: test_strict_wwm_scc
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[ENTER_STRICT_WWM:%[0-9]+]]:sreg_64 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; CHECK-NEXT: S_CMP_LT_I32 0, [[COPY3]], implicit-def $scc
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[COPY]], [[COPY]], implicit-def $vcc, implicit $exec
+ ; CHECK-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sgpr_32 = S_CSELECT_B32 [[COPY1]], [[COPY2]], implicit $scc
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_CSELECT_B32_]], [[V_ADD_CO_U32_e32_]], implicit-def $vcc, implicit $exec
+ ; CHECK-NEXT: $exec = EXIT_STRICT_WWM [[ENTER_STRICT_WWM]]
+ ; CHECK-NEXT: early-clobber $vgpr0 = V_MOV_B32_e32 [[V_ADD_CO_U32_e32_1]], implicit $exec
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%3 = COPY $vgpr0
%2 = COPY $sgpr2
%1 = COPY $sgpr1
@@ -96,16 +108,35 @@ body: |
---
# Second test for awareness that s_or_saveexec_b64 clobbers SCC
# Because the entry block is treated differently.
-#
-#CHECK: %bb.1
-#CHECK: S_CMP_LT_I32
-#CHECK: COPY $scc
-#CHECK: ENTER_STRICT_WWM
-#CHECK: $scc = COPY
-#CHECK: S_CSELECT_B32
name: test_strict_wwm_scc2
tracksRegLiveness: true
body: |
+ ; CHECK-LABEL: name: test_strict_wwm_scc2
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[ENTER_STRICT_WWM:%[0-9]+]]:sreg_64 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: $exec = EXIT_STRICT_WWM [[ENTER_STRICT_WWM]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: S_CMP_LT_I32 0, [[COPY3]], implicit-def $scc
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], [[DEF]], 0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0 = COPY $scc
+ ; CHECK-NEXT: [[ENTER_STRICT_WWM1:%[0-9]+]]:sreg_64 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: $scc = COPY [[COPY4]]
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[COPY]], [[COPY]], implicit-def $vcc, implicit $exec
+ ; CHECK-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sgpr_32 = S_CSELECT_B32 [[COPY1]], [[COPY2]], implicit $scc
+ ; CHECK-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_CSELECT_B32_]], [[V_ADD_CO_U32_e32_]], implicit-def $vcc, implicit $exec
+ ; CHECK-NEXT: $exec = EXIT_STRICT_WWM [[ENTER_STRICT_WWM1]]
+ ; CHECK-NEXT: early-clobber $vgpr0 = V_MOV_B32_e32 [[V_ADD_CO_U32_e32_1]], implicit $exec
+ ; CHECK-NEXT: $vgpr1 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
bb.0:
liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr0
@@ -130,7 +161,6 @@ body: |
---
# V_SET_INACTIVE, when its second operand is undef, is replaced by a
# COPY by si-wqm. Ensure the instruction is removed.
-#CHECK-NOT: V_SET_INACTIVE
name: no_cfg
alignment: 1
exposesReturnsTwice: false
@@ -167,6 +197,28 @@ body: |
bb.0:
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+ ; CHECK-LABEL: name: no_cfg
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
+ ; CHECK-NEXT: dead [[COPY4:%[0-9]+]]:sgpr_128 = COPY [[REG_SEQUENCE]]
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub1
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
+ ; CHECK-NEXT: dead [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]], implicit $exec, implicit-def $scc
+ ; CHECK-NEXT: [[ENTER_STRICT_WWM:%[0-9]+]]:sreg_64 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; CHECK-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY8]], [[COPY7]], 323, 12, 15, 0, implicit $exec
+ ; CHECK-NEXT: $exec = EXIT_STRICT_WWM [[ENTER_STRICT_WWM]]
+ ; CHECK-NEXT: early-clobber %15:vgpr_32 = V_MOV_B32_e32 [[V_MOV_B32_dpp]], implicit $exec
+ ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET_exact %15, [[REG_SEQUENCE]], [[S_MOV_B32_]], 4, 0, 0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
%3:sgpr_32 = COPY $sgpr3
%2:sgpr_32 = COPY $sgpr2
%1:sgpr_32 = COPY $sgpr1
@@ -189,18 +241,32 @@ body: |
---
# Ensure that strict_wwm is not put around an EXEC copy
-#CHECK-LABEL: name: copy_exec
-#CHECK: %7:sreg_64 = COPY $exec
-#CHECK-NEXT: %13:sreg_64 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec
-#CHECK-NEXT: %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-#CHECK-NEXT: $exec = EXIT_STRICT_WWM %13
-#CHECK-NEXT: %9:vgpr_32 = V_MBCNT_LO_U32_B32_e64 %7.sub0, 0, implicit $exec
name: copy_exec
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+ ; CHECK-LABEL: name: copy_exec
+ ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: dead [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec
+ ; CHECK-NEXT: [[ENTER_STRICT_WWM:%[0-9]+]]:sreg_64 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: $exec = EXIT_STRICT_WWM [[ENTER_STRICT_WWM]]
+ ; CHECK-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]].sub0, 0, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_MBCNT_LO_U32_B32_e64_]], 312, 15, 15, 0, implicit $exec
+ ; CHECK-NEXT: dead [[V_READLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READLANE_B32 [[V_MOV_B32_dpp]], 63
+ ; CHECK-NEXT: early-clobber %12:vgpr_32 = V_MOV_B32_e32 [[V_MOV_B32_e32_]], implicit $exec
+ ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET_exact %12, [[REG_SEQUENCE]], [[S_MOV_B32_]], 4, 0, 0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
%3:sgpr_32 = COPY $sgpr3
%2:sgpr_32 = COPY $sgpr2
%1:sgpr_32 = COPY $sgpr1
@@ -224,20 +290,48 @@ body: |
---
# Check exit of WQM is still inserted correctly when SCC is live until block end.
# Critically this tests that compilation does not fail.
-#CHECK-LABEL: name: scc_always_live
-#CHECK: %8:vreg_128 = IMAGE_SAMPLE_V4_V2 %7
-#CHECK-NEXT: S_CMP_EQ_U32 %2, 0, implicit-def $scc
-#CHECK-NEXT: undef %9.sub0:vreg_64 = nsz arcp nofpexcept V_ADD_F32_e64
-#CHECK-NEXT: %9.sub1:vreg_64 = nsz arcp nofpexcept V_MUL_F32_e32
-#CHECK-NEXT: %14:sreg_32_xm0 = COPY $scc
-#CHECK-NEXT: $exec = S_AND_B64 $exec, %13, implicit-def $scc
-#CHECK-NEXT: $scc = COPY %14
-#CHECK-NEXT: %10:vgpr_32 = nsz arcp nofpexcept V_ADD_F32_e64
-#CHECK-NEXT: %11:vreg_128 = IMAGE_SAMPLE_V4_V2
-#CHECK-NEXT: S_CBRANCH_SCC0 %bb.2
name: scc_always_live
tracksRegLiveness: true
body: |
+ ; CHECK-LABEL: name: scc_always_live
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $sgpr1, $sgpr2, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec
+ ; CHECK-NEXT: $m0 = COPY $sgpr1
+ ; CHECK-NEXT: $exec = S_WQM_B64 $exec, implicit-def $scc
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_INTERP_P1_F32_:%[0-9]+]]:vgpr_32 = V_INTERP_P1_F32 [[COPY1]], 3, 2, implicit $mode, implicit $m0, implicit $exec
+ ; CHECK-NEXT: [[V_INTERP_P1_F32_1:%[0-9]+]]:vgpr_32 = V_INTERP_P1_F32 [[COPY2]], 3, 2, implicit $mode, implicit $m0, implicit $exec
+ ; CHECK-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64 = COPY [[V_INTERP_P1_F32_]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]].sub1:vreg_64 = COPY [[V_INTERP_P1_F32_1]]
+ ; CHECK-NEXT: [[IMAGE_SAMPLE_V4_V2_:%[0-9]+]]:vreg_128 = IMAGE_SAMPLE_V4_V2 [[COPY4]], [[DEF]], [[DEF1]], 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+ ; CHECK-NEXT: S_CMP_EQ_U32 [[COPY3]], 0, implicit-def $scc
+ ; CHECK-NEXT: undef [[V_ADD_F32_e64_:%[0-9]+]].sub0:vreg_64 = nsz arcp nofpexcept V_ADD_F32_e64 0, [[IMAGE_SAMPLE_V4_V2_]].sub0, 0, [[V_INTERP_P1_F32_1]], 1, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]].sub1:vreg_64 = nsz arcp nofpexcept V_MUL_F32_e32 [[V_INTERP_P1_F32_]], [[V_INTERP_P1_F32_1]], implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0 = COPY $scc
+ ; CHECK-NEXT: $exec = S_AND_B64 $exec, [[COPY]], implicit-def $scc
+ ; CHECK-NEXT: $scc = COPY [[COPY5]]
+ ; CHECK-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nsz arcp nofpexcept V_ADD_F32_e64 0, [[V_INTERP_P1_F32_]], 0, [[V_INTERP_P1_F32_1]], 1, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[IMAGE_SAMPLE_V4_V2_1:%[0-9]+]]:vreg_128 = IMAGE_SAMPLE_V4_V2 [[V_ADD_F32_e64_]], [[DEF]], [[DEF1]], 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+ ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET_exact [[V_ADD_F32_e64_1]], [[DEF1]], [[S_MOV_B32_]], 4, 0, 0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: $vgpr0 = COPY [[IMAGE_SAMPLE_V4_V2_]].sub0
+ ; CHECK-NEXT: $vgpr1 = COPY [[IMAGE_SAMPLE_V4_V2_]].sub1
+ ; CHECK-NEXT: $vgpr2 = COPY [[IMAGE_SAMPLE_V4_V2_1]].sub0
+ ; CHECK-NEXT: $vgpr3 = COPY [[IMAGE_SAMPLE_V4_V2_1]].sub1
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0, $vgpr1, $vgpr2, $vgpr3
bb.0:
liveins: $sgpr1, $sgpr2, $vgpr1, $vgpr2
@@ -281,18 +375,26 @@ body: |
---
# Check that unnecessary instructions do not get marked for WWM
#
-#CHECK-NOT: ENTER_STRICT_WWM
-#CHECK: BUFFER_LOAD_DWORDX2
-#CHECK: ENTER_STRICT_WWM
-#CHECK: V_SET_INACTIVE_B32
-#CHECK: V_SET_INACTIVE_B32
-#CHECK-NOT: ENTER_STRICT_WWM
-#CHECK: V_MAX
name: test_wwm_set_inactive_propagation
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
+ ; CHECK-LABEL: name: test_wwm_set_inactive_propagation
+ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY1]], [[COPY]], 0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: [[ENTER_STRICT_WWM:%[0-9]+]]:sreg_64_xexec = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; CHECK-NEXT: dead [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]].sub0:vreg_64 = V_SET_INACTIVE_B32 0, [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0, 0, 0, undef [[ENTER_STRICT_WWM]], implicit $exec, implicit-def $scc
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]].sub1:vreg_64 = V_SET_INACTIVE_B32 0, [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1, 0, 0, undef [[ENTER_STRICT_WWM]], implicit $exec, implicit-def $scc
+ ; CHECK-NEXT: [[V_MAX_F64_e64_:%[0-9]+]]:vreg_64 = nnan nsz arcp contract reassoc nofpexcept V_MAX_F64_e64 0, [[BUFFER_LOAD_DWORDX2_OFFEN]], 0, [[BUFFER_LOAD_DWORDX2_OFFEN]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $exec = EXIT_STRICT_WWM [[ENTER_STRICT_WWM]]
+ ; CHECK-NEXT: early-clobber $vgpr0 = V_MOV_B32_e32 [[V_MAX_F64_e64_]].sub0, implicit $exec
+ ; CHECK-NEXT: early-clobber $vgpr1 = V_MOV_B32_e32 [[V_MAX_F64_e64_]].sub1, implicit $exec
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
%0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:vgpr_32 = COPY $vgpr0
%2:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN %1:vgpr_32, %0:sgpr_128, 0, 0, 0, 0, implicit $exec
@@ -308,15 +410,46 @@ body: |
---
# Check that WQM marking occurs correctly through phi nodes in the live range graph.
# If not, the initial V_MOV will not be in WQM.
-#
-#CHECK-LABEL: name: test_wqm_lr_phi
-#CHECK: COPY $exec
-#CHECK-NEXT: S_WQM
-#CHECK-NEXT: V_MOV_B32_e32 -10
-#CHECK-NEXT: V_MOV_B32_e32 0
name: test_wqm_lr_phi
tracksRegLiveness: true
body: |
+ ; CHECK-LABEL: name: test_wqm_lr_phi
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec
+ ; CHECK-NEXT: $exec = S_WQM_B64 $exec, implicit-def $scc
+ ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 -10, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[S_GETPC_B64_:%[0-9]+]]:sreg_64 = S_GETPC_B64
+ ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[S_GETPC_B64_]], 32, 0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $vcc = V_CMP_LT_U32_e64 4, 4, implicit $exec
+ ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit $vcc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_ADD_U32_e32 1, [[V_MOV_B32_e32_]].sub1, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: successors: %bb.4(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = V_ADD_U32_e32 1, [[V_MOV_B32_e32_]].sub1, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4:
+ ; CHECK-NEXT: $exec = S_AND_B64 $exec, [[COPY]], implicit-def $scc
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[IMAGE_SAMPLE_V4_V2_:%[0-9]+]]:vreg_128 = IMAGE_SAMPLE_V4_V2 [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX8_IMM]], [[DEF]], 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+ ; CHECK-NEXT: $vgpr0 = COPY [[IMAGE_SAMPLE_V4_V2_]].sub0
+ ; CHECK-NEXT: $vgpr1 = COPY [[IMAGE_SAMPLE_V4_V2_]].sub1
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
bb.0:
undef %0.sub0:vreg_64 = V_MOV_B32_e32 -10, implicit $exec
%0.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
@@ -345,14 +478,20 @@ body: |
...
---
-#CHECK-LABEL: name: no_wqm_in_cs
-#CHECK-NOT: S_WQM
name: no_wqm_in_cs
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr1, $vgpr2
+ ; CHECK-LABEL: name: no_wqm_in_cs
+ ; CHECK: liveins: $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr2
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: dead [[IMAGE_SAMPLE_V4_V2_:%[0-9]+]]:vreg_128 = IMAGE_SAMPLE_V4_V2 [[COPY]], [[DEF]], [[DEF1]], 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
undef %0.sub0:vreg_64 = COPY $vgpr1
%0.sub1:vreg_64 = COPY $vgpr2
%100:sgpr_256 = IMPLICIT_DEF
@@ -362,14 +501,20 @@ body: |
...
---
-#CHECK-LABEL: name: no_wqm_in_es
-#CHECK-NOT: S_WQM
name: no_wqm_in_es
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr1, $vgpr2
+ ; CHECK-LABEL: name: no_wqm_in_es
+ ; CHECK: liveins: $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr2
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: dead [[IMAGE_SAMPLE_V4_V2_:%[0-9]+]]:vreg_128 = IMAGE_SAMPLE_V4_V2 [[COPY]], [[DEF]], [[DEF1]], 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
undef %0.sub0:vreg_64 = COPY $vgpr1
%0.sub1:vreg_64 = COPY $vgpr2
%100:sgpr_256 = IMPLICIT_DEF
@@ -379,14 +524,20 @@ body: |
...
---
-#CHECK-LABEL: name: no_wqm_in_gs
-#CHECK-NOT: S_WQM
name: no_wqm_in_gs
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr1, $vgpr2
+ ; CHECK-LABEL: name: no_wqm_in_gs
+ ; CHECK: liveins: $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr2
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: dead [[IMAGE_SAMPLE_V4_V2_:%[0-9]+]]:vreg_128 = IMAGE_SAMPLE_V4_V2 [[COPY]], [[DEF]], [[DEF1]], 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
undef %0.sub0:vreg_64 = COPY $vgpr1
%0.sub1:vreg_64 = COPY $vgpr2
%100:sgpr_256 = IMPLICIT_DEF
@@ -396,14 +547,20 @@ body: |
...
---
-#CHECK-LABEL: name: no_wqm_in_hs
-#CHECK-NOT: S_WQM
name: no_wqm_in_hs
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr1, $vgpr2
+ ; CHECK-LABEL: name: no_wqm_in_hs
+ ; CHECK: liveins: $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr2
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: dead [[IMAGE_SAMPLE_V4_V2_:%[0-9]+]]:vreg_128 = IMAGE_SAMPLE_V4_V2 [[COPY]], [[DEF]], [[DEF1]], 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
undef %0.sub0:vreg_64 = COPY $vgpr1
%0.sub1:vreg_64 = COPY $vgpr2
%100:sgpr_256 = IMPLICIT_DEF
@@ -413,14 +570,20 @@ body: |
...
---
-#CHECK-LABEL: name: no_wqm_in_ls
-#CHECK-NOT: S_WQM
name: no_wqm_in_ls
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr1, $vgpr2
+ ; CHECK-LABEL: name: no_wqm_in_ls
+ ; CHECK: liveins: $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr2
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: dead [[IMAGE_SAMPLE_V4_V2_:%[0-9]+]]:vreg_128 = IMAGE_SAMPLE_V4_V2 [[COPY]], [[DEF]], [[DEF1]], 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
undef %0.sub0:vreg_64 = COPY $vgpr1
%0.sub1:vreg_64 = COPY $vgpr2
%100:sgpr_256 = IMPLICIT_DEF
@@ -430,14 +593,20 @@ body: |
...
---
-#CHECK-LABEL: name: no_wqm_in_vs
-#CHECK-NOT: S_WQM
name: no_wqm_in_vs
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr1, $vgpr2
+ ; CHECK-LABEL: name: no_wqm_in_vs
+ ; CHECK: liveins: $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr2
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: dead [[IMAGE_SAMPLE_V4_V2_:%[0-9]+]]:vreg_128 = IMAGE_SAMPLE_V4_V2 [[COPY]], [[DEF]], [[DEF1]], 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
undef %0.sub0:vreg_64 = COPY $vgpr1
%0.sub1:vreg_64 = COPY $vgpr2
%100:sgpr_256 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index f63329b..74e9ab7 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -165,7 +165,6 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
; GFX9-O0-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34
; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[36:37]
@@ -450,11 +449,8 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3
@@ -479,7 +475,6 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
; GFX9-O0-NEXT: v_add3_u32 v0, v0, v1, v2
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; implicit-def: $sgpr36
-; GFX9-O0-NEXT: ; implicit-def: $sgpr36
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
@@ -487,7 +482,6 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: s_mov_b32 s35, 0
-; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0
@@ -596,8 +590,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-NEXT: v_writelane_b32 v11, s35, 5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, s36
; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[34:35]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
; GFX9-O0-NEXT: s_mov_b32 s34, 32
@@ -623,8 +615,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-NEXT: v_readlane_b32 s39, v11, 3
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr40
-; GFX9-O0-NEXT: ; implicit-def: $sgpr40
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[40:41], v2, v4
@@ -758,8 +748,6 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[40:41]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr43
-; GFX9-O0-NEXT: ; implicit-def: $sgpr43
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
@@ -773,8 +761,6 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[40:41]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr43
-; GFX9-O0-NEXT: ; implicit-def: $sgpr43
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
@@ -791,8 +777,6 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: ; implicit-def: $sgpr42_sgpr43
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[40:41]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
@@ -801,10 +785,6 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11
@@ -899,60 +879,49 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
-; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_writelane_b32 v47, s64, 0
-; GFX9-O0-NEXT: v_writelane_b32 v47, s65, 1
-; GFX9-O0-NEXT: v_writelane_b32 v47, s66, 2
-; GFX9-O0-NEXT: v_writelane_b32 v47, s67, 3
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
; GFX9-O0-NEXT: v_mov_b32_e32 v35, s5
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
@@ -977,15 +946,14 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17
+; GFX9-O0-NEXT: v_mov_b32_e32 v40, s17
+; GFX9-O0-NEXT: v_mov_b32_e32 v39, s18
+; GFX9-O0-NEXT: v_mov_b32_e32 v38, s19
+; GFX9-O0-NEXT: v_mov_b32_e32 v37, s20
+; GFX9-O0-NEXT: v_mov_b32_e32 v36, s21
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v40, s18
-; GFX9-O0-NEXT: v_mov_b32_e32 v39, s19
-; GFX9-O0-NEXT: v_mov_b32_e32 v38, s20
-; GFX9-O0-NEXT: v_mov_b32_e32 v37, s21
-; GFX9-O0-NEXT: v_mov_b32_e32 v36, s22
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v47, s23
; GFX9-O0-NEXT: v_mov_b32_e32 v46, s24
; GFX9-O0-NEXT: v_mov_b32_e32 v45, s25
; GFX9-O0-NEXT: v_mov_b32_e32 v44, s26
@@ -1028,22 +996,20 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v12, v35
; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, v35
-; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v40
-; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v39
-; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v38
-; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v37
-; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v18, v36
-; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v40
+; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v39
+; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v38
+; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v16, v37
+; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v17, v36
+; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(5)
-; GFX9-O0-NEXT: v_mov_b32_e32 v19, v35
-; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v18, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v19, v47
; GFX9-O0-NEXT: v_mov_b32_e32 v20, v46
; GFX9-O0-NEXT: v_mov_b32_e32 v21, v45
; GFX9-O0-NEXT: v_mov_b32_e32 v22, v44
@@ -1061,51 +1027,35 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
; GFX9-O0-NEXT: v_mov_b32_e32 v30, v36
; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr35 killed $exec
-; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
+; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: s_waitcnt vmcnt(4)
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v11
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v0
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: s_mov_b64 s[38:39], 0
@@ -1120,8 +1070,6 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36
; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr38
-; GFX9-O0-NEXT: ; implicit-def: $sgpr38
; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v33
@@ -1137,8 +1085,6 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36
; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr38
-; GFX9-O0-NEXT: ; implicit-def: $sgpr38
; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v33
@@ -1154,8 +1100,6 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36
; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr38
-; GFX9-O0-NEXT: ; implicit-def: $sgpr38
; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v33
@@ -1171,8 +1115,6 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36
; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr38
-; GFX9-O0-NEXT: ; implicit-def: $sgpr38
; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v33
@@ -1188,8 +1130,6 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36
; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr36
-; GFX9-O0-NEXT: ; implicit-def: $sgpr36
; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v33
@@ -1240,24 +1180,19 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: v_mov_b32_e32 v23, s27
; GFX9-O0-NEXT: v_mov_b32_e32 v24, s28
; GFX9-O0-NEXT: v_mov_b32_e32 v25, s29
-; GFX9-O0-NEXT: v_readlane_b32 s67, v47, 3
-; GFX9-O0-NEXT: v_readlane_b32 s66, v47, 2
-; GFX9-O0-NEXT: v_readlane_b32 s65, v47, 1
-; GFX9-O0-NEXT: v_readlane_b32 s64, v47, 0
-; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
-; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 7dd03ad..6347a37 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -142,7 +142,6 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
-; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0
; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3]
@@ -454,11 +453,8 @@ define i64 @called_i64(i64 %a) noinline {
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3
@@ -483,7 +479,6 @@ define i64 @called_i64(i64 %a) noinline {
; GFX9-O0-NEXT: v_add3_u32 v0, v0, v1, v2
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
@@ -491,7 +486,6 @@ define i64 @called_i64(i64 %a) noinline {
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: s_mov_b32 s5, 0
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0
@@ -597,8 +591,6 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) %tmp14, i64 %arg) {
; GFX9-O0-NEXT: v_writelane_b32 v8, s3, 9
; GFX9-O0-NEXT: v_mov_b32_e32 v7, s8
; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr2
-; GFX9-O0-NEXT: ; implicit-def: $sgpr2
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6
; GFX9-O0-NEXT: s_mov_b32 s2, 32
@@ -638,8 +630,6 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) %tmp14, i64 %arg) {
; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 9
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10
; GFX9-O0-NEXT: v_add_co_u32_e64 v3, s[6:7], v3, v5
@@ -750,8 +740,6 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr9
-; GFX9-O0-NEXT: ; implicit-def: $sgpr9
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
@@ -765,8 +753,6 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr9
-; GFX9-O0-NEXT: ; implicit-def: $sgpr9
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
@@ -783,8 +769,6 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[6:7]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
@@ -793,10 +777,6 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11
@@ -994,7 +974,6 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
-; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0
; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3]
@@ -1306,11 +1285,8 @@ define i64 @strict_wwm_called_i64(i64 %a) noinline {
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3
@@ -1335,7 +1311,6 @@ define i64 @strict_wwm_called_i64(i64 %a) noinline {
; GFX9-O0-NEXT: v_add3_u32 v0, v0, v1, v2
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
@@ -1343,7 +1318,6 @@ define i64 @strict_wwm_called_i64(i64 %a) noinline {
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: s_mov_b32 s5, 0
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0
@@ -1449,8 +1423,6 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) %tmp14, i64 %arg
; GFX9-O0-NEXT: v_writelane_b32 v8, s3, 9
; GFX9-O0-NEXT: v_mov_b32_e32 v7, s8
; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr2
-; GFX9-O0-NEXT: ; implicit-def: $sgpr2
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6
; GFX9-O0-NEXT: s_mov_b32 s2, 32
@@ -1490,8 +1462,6 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) %tmp14, i64 %arg
; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 9
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
-; GFX9-O0-NEXT: ; implicit-def: $sgpr6
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10
; GFX9-O0-NEXT: v_add_co_u32_e64 v3, s[6:7], v3, v5
@@ -1602,8 +1572,6 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind
; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr9
-; GFX9-O0-NEXT: ; implicit-def: $sgpr9
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
@@ -1617,8 +1585,6 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind
; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr9
-; GFX9-O0-NEXT: ; implicit-def: $sgpr9
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
@@ -1635,8 +1601,6 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind
; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[6:7]
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
@@ -1645,10 +1609,6 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11